dongsheng@623: #!/usr/bin/perl dongsheng@623: dongsheng@623: # Po4a::Xml.pm dongsheng@623: # dongsheng@623: # extract and translate translatable strings from XML documents. dongsheng@623: # dongsheng@623: # This code extracts plain text from tags and attributes from generic dongsheng@623: # XML documents, and it can be used as a base to build modules for dongsheng@623: # XML-based documents. dongsheng@623: # dongsheng@623: # Copyright (c) 2004 by Jordi Vilalta dongsheng@623: # Copyright (c) 2008-2009 by Nicolas François dongsheng@623: # dongsheng@623: # This program is free software; you can redistribute it and/or modify dongsheng@623: # it under the terms of the GNU General Public License as published by dongsheng@623: # the Free Software Foundation; either version 2 of the License, or dongsheng@623: # (at your option) any later version. dongsheng@623: # dongsheng@623: # This program is distributed in the hope that it will be useful, dongsheng@623: # but WITHOUT ANY WARRANTY; without even the implied warranty of dongsheng@623: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the dongsheng@623: # GNU General Public License for more details. dongsheng@623: # dongsheng@623: # You should have received a copy of the GNU General Public License dongsheng@623: # along with this program; if not, write to the Free Software dongsheng@623: # Foundation, Inc., dongsheng@623: # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dongsheng@623: # dongsheng@623: ######################################################################## dongsheng@623: dongsheng@623: =head1 NAME dongsheng@623: dongsheng@623: Locale::Po4a::Xml - Convert XML documents and derivates from/to PO files dongsheng@623: dongsheng@623: =head1 DESCRIPTION dongsheng@623: dongsheng@623: The po4a (po for anything) project goal is to ease translations (and more dongsheng@623: interestingly, the maintenance of translations) using gettext tools on dongsheng@623: areas where they were not expected like documentation. dongsheng@623: dongsheng@623: Locale::Po4a::Xml is a module to help the translation of XML documents into dongsheng@623: other [human] languages. It can also be used as a base to build modules for dongsheng@623: XML-based documents. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: package Locale::Po4a::Xml; dongsheng@623: dongsheng@623: use 5.006; dongsheng@623: use strict; dongsheng@623: use warnings; dongsheng@623: dongsheng@623: require Exporter; dongsheng@623: use vars qw(@ISA @EXPORT); dongsheng@623: @ISA = qw(Locale::Po4a::TransTractor); dongsheng@623: @EXPORT = qw(new initialize @tag_types); dongsheng@623: dongsheng@623: use Locale::Po4a::TransTractor; dongsheng@623: use Locale::Po4a::Common; dongsheng@623: use Carp qw(croak); dongsheng@623: use File::Basename; dongsheng@623: use File::Spec; dongsheng@623: dongsheng@623: #It will mantain the path from the root tag to the current one dongsheng@623: my @path; dongsheng@623: dongsheng@623: #It will contain a list of external entities and their attached paths dongsheng@623: my %entities; dongsheng@623: dongsheng@623: my @comments; dongsheng@623: dongsheng@623: sub shiftline { dongsheng@623: my $self = shift; dongsheng@623: # call Transtractor's shiftline dongsheng@623: my ($line,$ref) = $self->SUPER::shiftline(); dongsheng@623: return ($line,$ref) if (not defined $line); dongsheng@623: dongsheng@623: for my $k (keys %entities) { dongsheng@623: if ($line =~ m/^(.*?)&$k;(.*)$/s) { dongsheng@623: my ($before, $after) = ($1, $2); dongsheng@623: my $linenum=0; dongsheng@623: my @textentries; dongsheng@623: dongsheng@623: open (my $in, $entities{$k}) dongsheng@623: or croak wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", "Can't read from %s: %s"), dongsheng@623: $entities{$k}, $!); dongsheng@623: while (defined (my $textline = <$in>)) { dongsheng@623: $linenum++; dongsheng@623: my $textref=$entities{$k}.":$linenum"; dongsheng@623: push @textentries, ($textline,$textref); dongsheng@623: } dongsheng@623: close $in dongsheng@623: or croak wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", "Can't close %s after reading: %s"), dongsheng@623: $entities{$k}, $!); dongsheng@623: dongsheng@623: push @textentries, ($after, $ref); dongsheng@623: $line = $before.(shift @textentries); dongsheng@623: $ref .= " ".(shift @textentries); dongsheng@623: $self->unshiftline(@textentries); dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: return ($line,$ref); dongsheng@623: } dongsheng@623: dongsheng@623: sub read { dongsheng@623: my ($self,$filename)=@_; dongsheng@623: push @{$self->{DOCPOD}{infile}}, $filename; dongsheng@623: $self->Locale::Po4a::TransTractor::read($filename); dongsheng@623: } dongsheng@623: dongsheng@623: sub parse { dongsheng@623: my $self=shift; dongsheng@623: map {$self->parse_file($_)} @{$self->{DOCPOD}{infile}}; dongsheng@623: } dongsheng@623: dongsheng@623: # @save_holders is a stack of references to ('paragraph', 'translation', dongsheng@623: # 'sub_translations', 'open', 'close', 'folded_attributes') hashes, where: dongsheng@623: # paragraph is a reference to an array (see paragraph in the dongsheng@623: # treat_content() subroutine) of strings followed by dongsheng@623: # references. It contains the @paragraph array as it was dongsheng@623: # before the processing was interrupted by a tag instroducing dongsheng@623: # a placeholder. dongsheng@623: # translation is the translation of this level up to now dongsheng@623: # sub_translations is a reference to an array of strings containing the dongsheng@623: # translations which must replace the placeholders. dongsheng@623: # open is the tag which opened the placeholder. dongsheng@623: # close is the tag which closed the placeholder. dongsheng@623: # folded_attributes is an hash of tags with their attributes ( dongsheng@623: # strings), referenced by the folded tag id, which should dongsheng@623: # replace the strings in the current dongsheng@623: # translation. dongsheng@623: # dongsheng@623: # If @save_holders only has 1 holder, then we are not processing the dongsheng@623: # content of an holder, we are translating the document. dongsheng@623: my @save_holders; dongsheng@623: dongsheng@623: dongsheng@623: # If we are at the bottom of the stack and there is no in dongsheng@623: # the current translation, we can push the translation in the translated dongsheng@623: # document. dongsheng@623: # Otherwise, we keep the translation in the current holder. dongsheng@623: sub pushline { dongsheng@623: my ($self, $line) = (shift, shift); dongsheng@623: dongsheng@623: my $holder = $save_holders[$#save_holders]; dongsheng@623: my $translation = $holder->{'translation'}; dongsheng@623: $translation .= $line; dongsheng@623: dongsheng@623: while ( %{$holder->{folded_attributes}} dongsheng@623: and $translation =~ m/^(.*)<([^>]+?)\s+po4a-id=([0-9]+)>(.*)$/s) { dongsheng@623: my $begin = $1; dongsheng@623: my $tag = $2; dongsheng@623: my $id = $3; dongsheng@623: my $end = $4; dongsheng@623: if (defined $holder->{folded_attributes}->{$id}) { dongsheng@623: # TODO: check if the tag is the same dongsheng@623: $translation = $begin.$holder->{folded_attributes}->{$id}.$end; dongsheng@623: delete $holder->{folded_attributes}->{$id}; dongsheng@623: } else { dongsheng@623: # TODO: It will be hard to identify the location. dongsheng@623: # => find a way to retrieve the reference. dongsheng@623: die wrap_mod("po4a::xml", dgettext("po4a", "'po4a-id=%d' in the translation does not exist in the original string (or 'po4a-id=%d' used twice in the translation)."), $id, $id); dongsheng@623: } dongsheng@623: } dongsheng@623: # TODO: check that %folded_attributes is empty at some time dongsheng@623: # => in translate_paragraph? dongsheng@623: dongsheng@623: if ( ($#save_holders > 0) dongsheng@623: or ($translation =~ m//s)) { dongsheng@623: $holder->{'translation'} = $translation; dongsheng@623: } else { dongsheng@623: $self->SUPER::pushline($translation); dongsheng@623: $holder->{'translation'} = ''; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: =head1 TRANSLATING WITH PO4A::XML dongsheng@623: dongsheng@623: This module can be used directly to handle generic XML documents. This will dongsheng@623: extract all tag's content, and no attributes, since it's where the text is dongsheng@623: written in most XML based documents. dongsheng@623: dongsheng@623: There are some options (described in the next section) that can customize dongsheng@623: this behavior. If this doesn't fit to your document format you're encouraged dongsheng@623: to write your own module derived from this, to describe your format's details. dongsheng@623: See the section "Writing derivate modules" below, for the process description. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: # dongsheng@623: # Parse file and translate it dongsheng@623: # dongsheng@623: sub parse_file { dongsheng@623: my ($self,$filename) = @_; dongsheng@623: my $eof = 0; dongsheng@623: dongsheng@623: while (!$eof) { dongsheng@623: # We get all the text until the next breaking tag (not dongsheng@623: # inline) and translate it dongsheng@623: $eof = $self->treat_content; dongsheng@623: if (!$eof) { dongsheng@623: # And then we treat the following breaking tag dongsheng@623: $eof = $self->treat_tag; dongsheng@623: } dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: =head1 OPTIONS ACCEPTED BY THIS MODULE dongsheng@623: dongsheng@623: The global debug option causes this module to show the excluded strings, in dongsheng@623: order to see if it skips something important. dongsheng@623: dongsheng@623: These are this module's particular options: dongsheng@623: dongsheng@623: =over 4 dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Prevents it to strip the spaces around the extracted strings. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Canonizes the string to translate, considering that whitespaces are not dongsheng@623: important, and wraps the translated document. This option can be overridden dongsheng@623: by custom tag options. See the "tags" option below. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: It makes the tags and attributes searching to work in a case insensitive dongsheng@623: way. If it's defined, it will treat EBooKElaNG and EBOOKELang as EbookElang. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: When defined, external entities are included in the generated (translated) dongsheng@623: document, and for the extraction of strings. If it's not defined, you dongsheng@623: will have to translate external entities separately as independent dongsheng@623: documents. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: This option defines the behavior of the module when it encounter a invalid dongsheng@623: Xml syntax (a closing tag which does not match the last opening tag, or a dongsheng@623: tag's attribute without value). dongsheng@623: It can take the following values: dongsheng@623: dongsheng@623: =over dongsheng@623: dongsheng@623: =item I dongsheng@623: dongsheng@623: This is the default value. dongsheng@623: The module will exit with an error. dongsheng@623: dongsheng@623: =item I dongsheng@623: dongsheng@623: The module will continue, and will issue a warning. dongsheng@623: dongsheng@623: =item I dongsheng@623: dongsheng@623: The module will continue without any warnings. dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: Be careful when using this option. dongsheng@623: It is generally recommended to fix the input file. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Extracts only the specified tags in the "tags" option. Otherwise, it dongsheng@623: will extract all the tags except the ones specified. dongsheng@623: dongsheng@623: Note: This option is deprecated. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: String that will try to match with the first line of the document's doctype dongsheng@623: (if defined). If it doesn't, a warning will indicate that the document dongsheng@623: might be of a bad type. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space-separated list of tags you want to translate or skip. By default, dongsheng@623: the specified tags will be excluded, but if you use the "tagsonly" option, dongsheng@623: the specified tags will be the only ones included. The tags must be in the dongsheng@623: form EaaaE, but you can join some (EbbbEEaaaE) to say that the content of dongsheng@623: the tag EaaaE will only be translated when it's into a EbbbE tag. dongsheng@623: dongsheng@623: You can also specify some tag options putting some characters in front of dongsheng@623: the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap) dongsheng@623: to override the default behavior specified by the global "wrap" option. dongsheng@623: dongsheng@623: Example: WEchapterEEtitleE dongsheng@623: dongsheng@623: Note: This option is deprecated. dongsheng@623: You should use the B and B options instead. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space-separated list of tag's attributes you want to translate. You can dongsheng@623: specify the attributes by their name (for example, "lang"), but you can dongsheng@623: prefix it with a tag hierarchy, to specify that this attribute will only be dongsheng@623: translated when it's into the specified tag. For example: EbbbEEaaaElang dongsheng@623: specifies that the lang attribute will only be translated if it's into an dongsheng@623: EaaaE tag, and it's into a EbbbE tag. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Do not translate attributes in inline tags. dongsheng@623: Instead, replace all attributes of a tag by po4a-id=. dongsheng@623: dongsheng@623: This is useful when attributes shall not be translated, as this simplifies the dongsheng@623: strings for translators, and avoids typos. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space-separated list of tags which should break the sequence. dongsheng@623: By default, all tags break the sequence. dongsheng@623: dongsheng@623: The tags must be in the form , but you can join some dongsheng@623: (), if a tag () should only be considered dongsheng@623: when it's into another tag (). dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space-separated list of tags which should be treated as inline. dongsheng@623: By default, all tags break the sequence. dongsheng@623: dongsheng@623: The tags must be in the form , but you can join some dongsheng@623: (), if a tag () should only be considered dongsheng@623: when it's into another tag (). dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space-separated list of tags which should be treated as placeholders. dongsheng@623: Placeholders do not break the sequence, but the content of placeholders is dongsheng@623: translated separately. dongsheng@623: dongsheng@623: The location of the placeholder in its blocks will be marked with a string dongsheng@623: similar to: dongsheng@623: dongsheng@623: dongsheng@623: dongsheng@623: The tags must be in the form , but you can join some dongsheng@623: (), if a tag () should only be considered dongsheng@623: when it's into another tag (). dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space separated list of tags that the module should not try to set by dongsheng@623: default in any category. dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Support C preprocessor directives. dongsheng@623: When this option is set, po4a will consider preprocessor directives as dongsheng@623: paragraph separators. dongsheng@623: This is important if the XML file must be preprocessed because otherwise dongsheng@623: the directives may be inserted in the middle of lines if po4a consider it dongsheng@623: belong to the current paragraph, and they won't be recognized by the dongsheng@623: preprocessor. dongsheng@623: Note: the preprocessor directives must only appear between tags dongsheng@623: (they must not break a tag). dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space-separated list of tags you want to translate. dongsheng@623: dongsheng@623: The tags must be in the form , but you can join some dongsheng@623: (), if a tag () should only be considered dongsheng@623: when it's into another tag (). dongsheng@623: dongsheng@623: You can also specify some tag options putting some characters in front of dongsheng@623: the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap) dongsheng@623: to overide the default behavior specified by the global "wrap" option. dongsheng@623: dongsheng@623: Example: WEchapterEEtitleE dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: Space-separated list of tags you do not want to translate. dongsheng@623: dongsheng@623: The tags must be in the form , but you can join some dongsheng@623: (), if a tag () should only be considered dongsheng@623: when it's into another tag (). dongsheng@623: dongsheng@623: =item B dongsheng@623: dongsheng@623: The default categories for tags that are not in any of the translated, dongsheng@623: untranslated, break, inline, or placeholder. dongsheng@623: dongsheng@623: This is a set of letters: dongsheng@623: dongsheng@623: =over dongsheng@623: dongsheng@623: =item I dongsheng@623: dongsheng@623: Tags should be translated and content can be re-wrapped. dongsheng@623: dongsheng@623: =item I dongsheng@623: dongsheng@623: Tags should be translated and content should not be re-wrapped. dongsheng@623: dongsheng@623: =item I dongsheng@623: dongsheng@623: Tags should be translated inline. dongsheng@623: dongsheng@623: =item I

dongsheng@623: dongsheng@623: Tags should be translated as placeholders. dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =cut dongsheng@623: # TODO: defaulttranslateoption dongsheng@623: # w => indicate that it is only valid for translatable tags and do not dongsheng@623: # care about inline/break/placeholder? dongsheng@623: # ... dongsheng@623: dongsheng@623: sub initialize { dongsheng@623: my $self = shift; dongsheng@623: my %options = @_; dongsheng@623: dongsheng@623: # Reset the path dongsheng@623: @path = (); dongsheng@623: dongsheng@623: # Initialize the stack of holders dongsheng@623: my @paragraph = (); dongsheng@623: my @sub_translations = (); dongsheng@623: my %folded_attributes; dongsheng@623: my %holder = ('paragraph' => \@paragraph, dongsheng@623: 'translation' => "", dongsheng@623: 'sub_translations' => \@sub_translations, dongsheng@623: 'folded_attributes' => \%folded_attributes); dongsheng@623: @save_holders = (\%holder); dongsheng@623: dongsheng@623: $self->{options}{'nostrip'}=0; dongsheng@623: $self->{options}{'wrap'}=0; dongsheng@623: $self->{options}{'caseinsensitive'}=0; dongsheng@623: $self->{options}{'tagsonly'}=0; dongsheng@623: $self->{options}{'tags'}=''; dongsheng@623: $self->{options}{'break'}=''; dongsheng@623: $self->{options}{'translated'}=''; dongsheng@623: $self->{options}{'untranslated'}=''; dongsheng@623: $self->{options}{'defaulttranslateoption'}=''; dongsheng@623: $self->{options}{'attributes'}=''; dongsheng@623: $self->{options}{'foldattributes'}=0; dongsheng@623: $self->{options}{'inline'}=''; dongsheng@623: $self->{options}{'placeholder'}=''; dongsheng@623: $self->{options}{'doctype'}=''; dongsheng@623: $self->{options}{'nodefault'}=''; dongsheng@623: $self->{options}{'includeexternal'}=0; dongsheng@623: $self->{options}{'ontagerror'}="fail"; dongsheng@623: $self->{options}{'cpp'}=0; dongsheng@623: dongsheng@623: $self->{options}{'verbose'}=''; dongsheng@623: $self->{options}{'debug'}=''; dongsheng@623: dongsheng@623: foreach my $opt (keys %options) { dongsheng@623: if ($options{$opt}) { dongsheng@623: die wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", "Unknown option: %s"), $opt) dongsheng@623: unless exists $self->{options}{$opt}; dongsheng@623: $self->{options}{$opt} = $options{$opt}; dongsheng@623: } dongsheng@623: } dongsheng@623: # Default options set by modules. Forbidden for users. dongsheng@623: $self->{options}{'_default_translated'}=''; dongsheng@623: $self->{options}{'_default_untranslated'}=''; dongsheng@623: $self->{options}{'_default_break'}=''; dongsheng@623: $self->{options}{'_default_inline'}=''; dongsheng@623: $self->{options}{'_default_placeholder'}=''; dongsheng@623: $self->{options}{'_default_attributes'}=''; dongsheng@623: dongsheng@623: #It will maintain the list of the translatable tags dongsheng@623: $self->{tags}=(); dongsheng@623: $self->{translated}=(); dongsheng@623: $self->{untranslated}=(); dongsheng@623: #It will maintain the list of the translatable attributes dongsheng@623: $self->{attributes}=(); dongsheng@623: #It will maintain the list of the breaking tags dongsheng@623: $self->{break}=(); dongsheng@623: #It will maintain the list of the inline tags dongsheng@623: $self->{inline}=(); dongsheng@623: #It will maintain the list of the placeholder tags dongsheng@623: $self->{placeholder}=(); dongsheng@623: #list of the tags that must not be set in the tags or inline category dongsheng@623: #by this module or sub-module (unless specified in an option) dongsheng@623: $self->{nodefault}=(); dongsheng@623: dongsheng@623: $self->treat_options; dongsheng@623: } dongsheng@623: dongsheng@623: =head1 WRITING DERIVATE MODULES dongsheng@623: dongsheng@623: =head2 DEFINE WHAT TAGS AND ATTRIBUTES TO TRANSLATE dongsheng@623: dongsheng@623: The simplest customization is to define which tags and attributes you want dongsheng@623: the parser to translate. This should be done in the initialize function. dongsheng@623: First you should call the main initialize, to get the command-line options, dongsheng@623: and then, append your custom definitions to the options hash. If you want dongsheng@623: to treat some new options from command line, you should define them before dongsheng@623: calling the main initialize: dongsheng@623: dongsheng@623: $self->{options}{'new_option'}=''; dongsheng@623: $self->SUPER::initialize(%options); dongsheng@623: $self->{options}{'_default_translated'}.='

'; dongsheng@623: $self->{options}{'attributes'}.=' <p>lang id'; dongsheng@623: $self->{options}{'_default_inline'}.=' <br>'; dongsheng@623: $self->treat_options; dongsheng@623: dongsheng@623: You should use the B<_default_inline>, B<_default_break>, dongsheng@623: B<_default_placeholder>, B<_default_translated>, B<_default_untranslated>, dongsheng@623: and B<_default_attributes> options in derivated modules. This allow users dongsheng@623: to override the default behavior defined in your module with command line dongsheng@623: options. dongsheng@623: dongsheng@623: =head2 OVERRIDING THE found_string FUNCTION dongsheng@623: dongsheng@623: Another simple step is to override the function "found_string", which dongsheng@623: receives the extracted strings from the parser, in order to translate them. dongsheng@623: There you can control which strings you want to translate, and perform dongsheng@623: transformations to them before or after the translation itself. dongsheng@623: dongsheng@623: It receives the extracted text, the reference on where it was, and a hash dongsheng@623: that contains extra information to control what strings to translate, how dongsheng@623: to translate them and to generate the comment. dongsheng@623: dongsheng@623: The content of these options depends on the kind of string it is (specified in an dongsheng@623: entry of this hash): dongsheng@623: dongsheng@623: =over dongsheng@623: dongsheng@623: =item type="tag" dongsheng@623: dongsheng@623: The found string is the content of a translatable tag. The entry "tag_options" dongsheng@623: contains the option characters in front of the tag hierarchy in the module dongsheng@623: "tags" option. dongsheng@623: dongsheng@623: =item type="attribute" dongsheng@623: dongsheng@623: Means that the found string is the value of a translatable attribute. The dongsheng@623: entry "attribute" has the name of the attribute. dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: It must return the text that will replace the original in the translated dongsheng@623: document. Here's a basic example of this function: dongsheng@623: dongsheng@623: sub found_string { dongsheng@623: my ($self,$text,$ref,$options)=@_; dongsheng@623: $text = $self->translate($text,$ref,"type ".$options->{'type'}, dongsheng@623: 'wrap'=>$self->{options}{'wrap'}); dongsheng@623: return $text; dongsheng@623: } dongsheng@623: dongsheng@623: There's another simple example in the new Dia module, which only filters dongsheng@623: some strings. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub found_string { dongsheng@623: my ($self,$text,$ref,$options)=@_; dongsheng@623: dongsheng@623: if ($text =~ m/^\s*$/s) { dongsheng@623: return $text; dongsheng@623: } dongsheng@623: dongsheng@623: my $comment; dongsheng@623: my $wrap = $self->{options}{'wrap'}; dongsheng@623: dongsheng@623: if ($options->{'type'} eq "tag") { dongsheng@623: $comment = "Content of: ".$self->get_path; dongsheng@623: dongsheng@623: if($options->{'tag_options'} =~ /w/) { dongsheng@623: $wrap = 1; dongsheng@623: } dongsheng@623: if($options->{'tag_options'} =~ /W/) { dongsheng@623: $wrap = 0; dongsheng@623: } dongsheng@623: } elsif ($options->{'type'} eq "attribute") { dongsheng@623: $comment = "Attribute '".$options->{'attribute'}."' of: ".$self->get_path; dongsheng@623: } elsif ($options->{'type'} eq "CDATA") { dongsheng@623: $comment = "CDATA"; dongsheng@623: $wrap = 0; dongsheng@623: } else { dongsheng@623: die wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Internal error: unknown type identifier '%s'."), $options->{'type'}); dongsheng@623: } dongsheng@623: $text = $self->translate($text,$ref,$comment,'wrap'=>$wrap, comment => $options->{'comments'}); dongsheng@623: return $text; dongsheng@623: } dongsheng@623: dongsheng@623: =head2 MODIFYING TAG TYPES (TODO) dongsheng@623: dongsheng@623: This is a more complex one, but it enables a (almost) total customization. dongsheng@623: It's based in a list of hashes, each one defining a tag type's behavior. The dongsheng@623: list should be sorted so that the most general tags are after the most dongsheng@623: concrete ones (sorted first by the beginning and then by the end keys). To dongsheng@623: define a tag type you'll have to make a hash with the following keys: dongsheng@623: dongsheng@623: =over 4 dongsheng@623: dongsheng@623: =item beginning dongsheng@623: dongsheng@623: Specifies the beginning of the tag, after the "E<lt>". dongsheng@623: dongsheng@623: =item end dongsheng@623: dongsheng@623: Specifies the end of the tag, before the "E<gt>". dongsheng@623: dongsheng@623: =item breaking dongsheng@623: dongsheng@623: It says if this is a breaking tag class. A non-breaking (inline) tag is one dongsheng@623: that can be taken as part of the content of another tag. It can take the dongsheng@623: values false (0), true (1) or undefined. If you leave this undefined, you'll dongsheng@623: have to define the f_breaking function that will say whether a concrete tag of dongsheng@623: this class is a breaking tag or not. dongsheng@623: dongsheng@623: =item f_breaking dongsheng@623: dongsheng@623: It's a function that will tell if the next tag is a breaking one or not. It dongsheng@623: should be defined if the "breaking" option is not. dongsheng@623: dongsheng@623: =item f_extract dongsheng@623: dongsheng@623: If you leave this key undefined, the generic extraction function will have to dongsheng@623: extract the tag itself. It's useful for tags that can have other tags or dongsheng@623: special structures in them, so that the main parser doesn't get mad. This dongsheng@623: function receives a boolean that says if the tag should be removed from the dongsheng@623: input stream or not. dongsheng@623: dongsheng@623: =item f_translate dongsheng@623: dongsheng@623: This function receives the tag (in the get_string_until() format) and returns dongsheng@623: the translated tag (translated attributes or all needed transformations) as a dongsheng@623: single string. dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: ##### Generic XML tag types #####' dongsheng@623: dongsheng@623: our @tag_types = ( dongsheng@623: { beginning => "!--#", dongsheng@623: end => "--", dongsheng@623: breaking => 0, dongsheng@623: f_extract => \&tag_extract_comment, dongsheng@623: f_translate => \&tag_trans_comment}, dongsheng@623: { beginning => "!--", dongsheng@623: end => "--", dongsheng@623: breaking => 0, dongsheng@623: f_extract => \&tag_extract_comment, dongsheng@623: f_translate => \&tag_trans_comment}, dongsheng@623: { beginning => "?xml", dongsheng@623: end => "?", dongsheng@623: breaking => 1, dongsheng@623: f_translate => \&tag_trans_xmlhead}, dongsheng@623: { beginning => "?", dongsheng@623: end => "?", dongsheng@623: breaking => 1, dongsheng@623: f_translate => \&tag_trans_procins}, dongsheng@623: { beginning => "!DOCTYPE", dongsheng@623: end => "", dongsheng@623: breaking => 1, dongsheng@623: f_extract => \&tag_extract_doctype, dongsheng@623: f_translate => \&tag_trans_doctype}, dongsheng@623: { beginning => "![CDATA[", dongsheng@623: end => "", dongsheng@623: breaking => 1, dongsheng@623: f_extract => \&CDATA_extract, dongsheng@623: f_translate => \&CDATA_trans}, dongsheng@623: { beginning => "/", dongsheng@623: end => "", dongsheng@623: f_breaking => \&tag_break_close, dongsheng@623: f_translate => \&tag_trans_close}, dongsheng@623: { beginning => "", dongsheng@623: end => "/", dongsheng@623: f_breaking => \&tag_break_alone, dongsheng@623: f_translate => \&tag_trans_alone}, dongsheng@623: { beginning => "", dongsheng@623: end => "", dongsheng@623: f_breaking => \&tag_break_open, dongsheng@623: f_translate => \&tag_trans_open} dongsheng@623: ); dongsheng@623: dongsheng@623: sub tag_extract_comment { dongsheng@623: my ($self,$remove)=(shift,shift); dongsheng@623: my ($eof,@tag)=$self->get_string_until('-->',{include=>1,remove=>$remove}); dongsheng@623: return ($eof,@tag); dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_trans_comment { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: return $self->join_lines(@tag); dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_trans_xmlhead { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: dongsheng@623: # We don't have to translate anything from here: throw away references dongsheng@623: my $tag = $self->join_lines(@tag); dongsheng@623: $tag =~ /encoding=(("|')|)(.*?)(\s|\2)/s; dongsheng@623: my $in_charset=$3; dongsheng@623: $self->detected_charset($in_charset); dongsheng@623: my $out_charset=$self->get_out_charset; dongsheng@623: dongsheng@623: if (defined $in_charset) { dongsheng@623: $tag =~ s/$in_charset/$out_charset/; dongsheng@623: } else { dongsheng@623: if ($tag =~ m/standalone/) { dongsheng@623: $tag =~ s/(standalone)/encoding="$out_charset" $1/; dongsheng@623: } else { dongsheng@623: $tag.= " encoding=\"$out_charset\""; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: return $tag; dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_trans_procins { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: return $self->join_lines(@tag); dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_extract_doctype { dongsheng@623: my ($self,$remove)=(shift,shift); dongsheng@623: dongsheng@623: # Check if there is an internal subset (between []). dongsheng@623: my ($eof,@tag)=$self->get_string_until('>',{include=>1,unquoted=>1}); dongsheng@623: my $parity = 0; dongsheng@623: my $paragraph = ""; dongsheng@623: map { $parity = 1 - $parity; $paragraph.= $parity?$_:""; } @tag; dongsheng@623: my $found = 0; dongsheng@623: if ($paragraph =~ m/<.*\[.*</s) { dongsheng@623: $found = 1 dongsheng@623: } dongsheng@623: dongsheng@623: if (not $found) { dongsheng@623: ($eof,@tag)=$self->get_string_until('>',{include=>1,remove=>$remove,unquoted=>1}); dongsheng@623: } else { dongsheng@623: ($eof,@tag)=$self->get_string_until(']\s*>',{include=>1,remove=>$remove,unquoted=>1,regex=>1}); dongsheng@623: } dongsheng@623: return ($eof,@tag); dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_trans_doctype { dongsheng@623: # This check is not really reliable. There are system and public dongsheng@623: # identifiers. Only the public one could be checked reliably. dongsheng@623: my ($self,@tag)=@_; dongsheng@623: if (defined $self->{options}{'doctype'} ) { dongsheng@623: my $doctype = $self->{options}{'doctype'}; dongsheng@623: if ( $tag[0] !~ /\Q$doctype\E/i ) { dongsheng@623: warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Bad document type. '%s' expected. You can fix this warning with a -o doctype option, or ignore this check with -o doctype=\"\"."), $doctype); dongsheng@623: } dongsheng@623: } dongsheng@623: my $i = 0; dongsheng@623: my $basedir = $tag[1]; dongsheng@623: $basedir =~ s/:[0-9]+$//; dongsheng@623: $basedir = dirname($basedir); dongsheng@623: dongsheng@623: while ( $i < $#tag ) { dongsheng@623: my $t = $tag[$i]; dongsheng@623: my $ref = $tag[$i+1]; dongsheng@623: if ( $t =~ /^(\s*<!ENTITY\s+)(.*)$/is ) { dongsheng@623: my $part1 = $1; dongsheng@623: my $part2 = $2; dongsheng@623: my $includenow = 0; dongsheng@623: my $file = 0; dongsheng@623: my $name = ""; dongsheng@623: if ($part2 =~ /^(%\s+)(.*)$/s ) { dongsheng@623: $part1.= $1; dongsheng@623: $part2 = $2; dongsheng@623: $includenow = 1; dongsheng@623: } dongsheng@623: $part2 =~ /^(\S+)(\s+)(.*)$/s; dongsheng@623: $name = $1; dongsheng@623: $part1.= $1.$2; dongsheng@623: $part2 = $3; dongsheng@623: if ( $part2 =~ /^(SYSTEM\s+)(.*)$/is ) { dongsheng@623: $part1.= $1; dongsheng@623: $part2 = $2; dongsheng@623: $file = 1; dongsheng@623: if ($self->{options}{'includeexternal'}) { dongsheng@623: $entities{$name} = $part2; dongsheng@623: $entities{$name} =~ s/^"?(.*?)".*$/$1/s; dongsheng@623: $entities{$name} = File::Spec->catfile($basedir, $entities{$name}); dongsheng@623: } dongsheng@623: } dongsheng@623: if ((not $file) and (not $includenow)) { dongsheng@623: if ($part2 =~ m/^\s*(["'])(.*)\1(\s*>.*)$/s) { dongsheng@623: my $comment = "Content of the $name entity"; dongsheng@623: my $quote = $1; dongsheng@623: my $text = $2; dongsheng@623: $part2 = $3; dongsheng@623: $text = $self->translate($text, dongsheng@623: $ref, dongsheng@623: $comment, dongsheng@623: 'wrap'=>1); dongsheng@623: $t = $part1."$quote$text$quote$part2"; dongsheng@623: } dongsheng@623: } dongsheng@623: # print $part1."\n"; dongsheng@623: # print $name."\n"; dongsheng@623: # print $part2."\n"; dongsheng@623: } dongsheng@623: $tag[$i] = $t; dongsheng@623: $i += 2; dongsheng@623: } dongsheng@623: return $self->join_lines(@tag); dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_break_close { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: my $struct = $self->get_path; dongsheng@623: my $options = $self->get_translate_options($struct); dongsheng@623: if ($options =~ m/[ip]/) { dongsheng@623: return 0; dongsheng@623: } else { dongsheng@623: return 1; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_trans_close { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: my $name = $self->get_tag_name(@tag); dongsheng@623: dongsheng@623: my $test = pop @path; dongsheng@623: if (!defined($test) || $test ne $name ) { dongsheng@623: my $ontagerror = $self->{options}{'ontagerror'}; dongsheng@623: if ($ontagerror eq "warn") { dongsheng@623: warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name); dongsheng@623: } elsif ($ontagerror ne "silent") { dongsheng@623: die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name); dongsheng@623: } dongsheng@623: } dongsheng@623: return $self->join_lines(@tag); dongsheng@623: } dongsheng@623: dongsheng@623: sub CDATA_extract { dongsheng@623: my ($self,$remove)=(shift,shift); dongsheng@623: my ($eof, @tag) = $self->get_string_until(']]>',{include=>1,unquoted=>0,remove=>$remove}); dongsheng@623: dongsheng@623: return ($eof, @tag); dongsheng@623: } dongsheng@623: dongsheng@623: sub CDATA_trans { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: return $self->found_string($self->join_lines(@tag), dongsheng@623: $tag[1], dongsheng@623: {'type' => "CDATA"}); dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_break_alone { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: my $struct = $self->get_path($self->get_tag_name(@tag)); dongsheng@623: if ($self->get_translate_options($struct) =~ m/i/) { dongsheng@623: return 0; dongsheng@623: } else { dongsheng@623: return 1; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_trans_alone { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: my $name = $self->get_tag_name(@tag); dongsheng@623: push @path, $name; dongsheng@623: dongsheng@623: $name = $self->treat_attributes(@tag); dongsheng@623: dongsheng@623: pop @path; dongsheng@623: return $name; dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_break_open { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: my $struct = $self->get_path($self->get_tag_name(@tag)); dongsheng@623: my $options = $self->get_translate_options($struct); dongsheng@623: if ($options =~ m/[ip]/) { dongsheng@623: return 0; dongsheng@623: } else { dongsheng@623: return 1; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: sub tag_trans_open { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: my $name = $self->get_tag_name(@tag); dongsheng@623: push @path, $name; dongsheng@623: dongsheng@623: $name = $self->treat_attributes(@tag); dongsheng@623: dongsheng@623: return $name; dongsheng@623: } dongsheng@623: dongsheng@623: ##### END of Generic XML tag types ##### dongsheng@623: dongsheng@623: =head1 INTERNAL FUNCTIONS used to write derivated parsers dongsheng@623: dongsheng@623: =head2 WORKING WITH TAGS dongsheng@623: dongsheng@623: =over 4 dongsheng@623: dongsheng@623: =item get_path() dongsheng@623: dongsheng@623: This function returns the path to the current tag from the document's root, dongsheng@623: in the form E<lt>htmlE<gt>E<lt>bodyE<gt>E<lt>pE<gt>. dongsheng@623: dongsheng@623: An additional array of tags (without brackets) can be passed in argument. dongsheng@623: These path elements are added to the end of the current path. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub get_path { dongsheng@623: my $self = shift; dongsheng@623: my @add = @_; dongsheng@623: if ( @path > 0 or @add > 0 ) { dongsheng@623: return "<".join("><",@path,@add).">"; dongsheng@623: } else { dongsheng@623: return "outside any tag (error?)"; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: =item tag_type() dongsheng@623: dongsheng@623: This function returns the index from the tag_types list that fits to the next dongsheng@623: tag in the input stream, or -1 if it's at the end of the input file. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub tag_type { dongsheng@623: my $self = shift; dongsheng@623: my ($line,$ref) = $self->shiftline(); dongsheng@623: my ($match1,$match2); dongsheng@623: my $found = 0; dongsheng@623: my $i = 0; dongsheng@623: dongsheng@623: if (!defined($line)) { return -1; } dongsheng@623: dongsheng@623: $self->unshiftline($line,$ref); dongsheng@623: my ($eof,@lines) = $self->get_string_until(">",{include=>1,unquoted=>1}); dongsheng@623: my $line2 = $self->join_lines(@lines); dongsheng@623: while (!$found && $i < @tag_types) { dongsheng@623: ($match1,$match2) = ($tag_types[$i]->{beginning},$tag_types[$i]->{end}); dongsheng@623: if ($line =~ /^<\Q$match1\E/) { dongsheng@623: if (!defined($tag_types[$i]->{f_extract})) { dongsheng@623: #print substr($line2,length($line2)-1-length($match2),1+length($match2))."\n"; dongsheng@623: if (defined($line2) and $line2 =~ /\Q$match2\E>$/) { dongsheng@623: $found = 1; dongsheng@623: #print "YES: <".$match1." ".$match2.">\n"; dongsheng@623: } else { dongsheng@623: #print "NO: <".$match1." ".$match2.">\n"; dongsheng@623: $i++; dongsheng@623: } dongsheng@623: } else { dongsheng@623: $found = 1; dongsheng@623: } dongsheng@623: } else { dongsheng@623: $i++; dongsheng@623: } dongsheng@623: } dongsheng@623: if (!$found) { dongsheng@623: #It should never enter here, unless you undefine the most dongsheng@623: #general tags (as <...>) dongsheng@623: die "po4a::xml: Unknown tag type: ".$line."\n"; dongsheng@623: } else { dongsheng@623: return $i; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: =item extract_tag($$) dongsheng@623: dongsheng@623: This function returns the next tag from the input stream without the beginning dongsheng@623: and end, in an array form, to maintain the references from the input file. It dongsheng@623: has two parameters: the type of the tag (as returned by tag_type) and a dongsheng@623: boolean, that indicates if it should be removed from the input stream. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub extract_tag { dongsheng@623: my ($self,$type,$remove) = (shift,shift,shift); dongsheng@623: my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end}); dongsheng@623: my ($eof,@tag); dongsheng@623: if (defined($tag_types[$type]->{f_extract})) { dongsheng@623: ($eof,@tag) = &{$tag_types[$type]->{f_extract}}($self,$remove); dongsheng@623: } else { dongsheng@623: ($eof,@tag) = $self->get_string_until($match2.">",{include=>1,remove=>$remove,unquoted=>1}); dongsheng@623: } dongsheng@623: $tag[0] =~ /^<\Q$match1\E(.*)$/s; dongsheng@623: $tag[0] = $1; dongsheng@623: $tag[$#tag-1] =~ /^(.*)\Q$match2\E>$/s; dongsheng@623: $tag[$#tag-1] = $1; dongsheng@623: return ($eof,@tag); dongsheng@623: } dongsheng@623: dongsheng@623: =item get_tag_name(@) dongsheng@623: dongsheng@623: This function returns the name of the tag passed as an argument, in the array dongsheng@623: form returned by extract_tag. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub get_tag_name { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: $tag[0] =~ /^(\S*)/; dongsheng@623: return $1; dongsheng@623: } dongsheng@623: dongsheng@623: =item breaking_tag() dongsheng@623: dongsheng@623: This function returns a boolean that says if the next tag in the input stream dongsheng@623: is a breaking tag or not (inline tag). It leaves the input stream intact. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub breaking_tag { dongsheng@623: my $self = shift; dongsheng@623: my $break; dongsheng@623: dongsheng@623: my $type = $self->tag_type; dongsheng@623: if ($type == -1) { return 0; } dongsheng@623: dongsheng@623: #print "TAG TYPE = ".$type."\n"; dongsheng@623: $break = $tag_types[$type]->{breaking}; dongsheng@623: if (!defined($break)) { dongsheng@623: # This tag's breaking depends on its content dongsheng@623: my ($eof,@lines) = $self->extract_tag($type,0); dongsheng@623: $break = &{$tag_types[$type]->{f_breaking}}($self,@lines); dongsheng@623: } dongsheng@623: #print "break = ".$break."\n"; dongsheng@623: return $break; dongsheng@623: } dongsheng@623: dongsheng@623: =item treat_tag() dongsheng@623: dongsheng@623: This function translates the next tag from the input stream. Using each dongsheng@623: tag type's custom translation functions. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub treat_tag { dongsheng@623: my $self = shift; dongsheng@623: my $type = $self->tag_type; dongsheng@623: dongsheng@623: my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end}); dongsheng@623: my ($eof,@lines) = $self->extract_tag($type,1); dongsheng@623: dongsheng@623: $lines[0] =~ /^(\s*)(.*)$/s; dongsheng@623: my $space1 = $1; dongsheng@623: $lines[0] = $2; dongsheng@623: $lines[$#lines-1] =~ /^(.*?)(\s*)$/s; dongsheng@623: my $space2 = $2; dongsheng@623: $lines[$#lines-1] = $1; dongsheng@623: dongsheng@623: # Calling this tag type's specific handling (translation of dongsheng@623: # attributes...) dongsheng@623: my $line = &{$tag_types[$type]->{f_translate}}($self,@lines); dongsheng@623: $self->pushline("<".$match1.$space1.$line.$space2.$match2.">"); dongsheng@623: return $eof; dongsheng@623: } dongsheng@623: dongsheng@623: =item tag_in_list($@) dongsheng@623: dongsheng@623: This function returns a string value that says if the first argument (a tag dongsheng@623: hierarchy) matches any of the tags from the second argument (a list of tags dongsheng@623: or tag hierarchies). If it doesn't match, it returns 0. Else, it returns the dongsheng@623: matched tag's options (the characters in front of the tag) or 1 (if that tag dongsheng@623: doesn't have options). dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =cut dongsheng@623: sub tag_in_list ($$$) { dongsheng@623: my ($self,$path,$list) = @_; dongsheng@623: if ($self->{options}{'caseinsensitive'}) { dongsheng@623: $path = lc $path; dongsheng@623: } dongsheng@623: dongsheng@623: while (1) { dongsheng@623: if (defined $list->{$path}) { dongsheng@623: if (length $list->{$path}) { dongsheng@623: return $list->{$path}; dongsheng@623: } else { dongsheng@623: return 1; dongsheng@623: } dongsheng@623: } dongsheng@623: last unless ($path =~ m/</); dongsheng@623: $path =~ s/^<.*?>//; dongsheng@623: } dongsheng@623: dongsheng@623: return 0; dongsheng@623: } dongsheng@623: dongsheng@623: =head2 WORKING WITH ATTRIBUTES dongsheng@623: dongsheng@623: =over 4 dongsheng@623: dongsheng@623: =item treat_attributes(@) dongsheng@623: dongsheng@623: This function handles the translation of the tags' attributes. It receives the tag dongsheng@623: without the beginning / end marks, and then it finds the attributes, and it dongsheng@623: translates the translatable ones (specified by the module option "attributes"). dongsheng@623: This returns a plain string with the translated tag. dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub treat_attributes { dongsheng@623: my ($self,@tag)=@_; dongsheng@623: dongsheng@623: $tag[0] =~ /^(\S*)(.*)/s; dongsheng@623: my $text = $1; dongsheng@623: $tag[0] = $2; dongsheng@623: dongsheng@623: while (@tag) { dongsheng@623: my $complete = 1; dongsheng@623: dongsheng@623: $text .= $self->skip_spaces(\@tag); dongsheng@623: if (@tag) { dongsheng@623: # Get the attribute's name dongsheng@623: $complete = 0; dongsheng@623: dongsheng@623: $tag[0] =~ /^([^\s=]+)(.*)/s; dongsheng@623: my $name = $1; dongsheng@623: my $ref = $tag[1]; dongsheng@623: $tag[0] = $2; dongsheng@623: $text .= $name; dongsheng@623: $text .= $self->skip_spaces(\@tag); dongsheng@623: if (@tag) { dongsheng@623: # Get the '=' dongsheng@623: if ($tag[0] =~ /^=(.*)/s) { dongsheng@623: $tag[0] = $1; dongsheng@623: $text .= "="; dongsheng@623: $text .= $self->skip_spaces(\@tag); dongsheng@623: if (@tag) { dongsheng@623: # Get the value dongsheng@623: my $value=""; dongsheng@623: $ref=$tag[1]; dongsheng@623: my $quot=substr($tag[0],0,1); dongsheng@623: if ($quot ne "\"" and $quot ne "'") { dongsheng@623: # Unquoted value dongsheng@623: $quot=""; dongsheng@623: $tag[0] =~ /^(\S+)(.*)/s; dongsheng@623: $value = $1; dongsheng@623: $tag[0] = $2; dongsheng@623: } else { dongsheng@623: # Quoted value dongsheng@623: $text .= $quot; dongsheng@623: $tag[0] =~ /^\Q$quot\E(.*)/s; dongsheng@623: $tag[0] = $1; dongsheng@623: while ($tag[0] !~ /\Q$quot\E/) { dongsheng@623: $value .= $tag[0]; dongsheng@623: shift @tag; dongsheng@623: shift @tag; dongsheng@623: } dongsheng@623: $tag[0] =~ /^(.*?)\Q$quot\E(.*)/s; dongsheng@623: $value .= $1; dongsheng@623: $tag[0] = $2; dongsheng@623: } dongsheng@623: $complete = 1; dongsheng@623: if ($self->tag_in_list($self->get_path.$name,$self->{attributes})) { dongsheng@623: $text .= $self->found_string($value, $ref, { type=>"attribute", attribute=>$name }); dongsheng@623: } else { dongsheng@623: print wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Content of attribute %s excluded: %s"), $self->get_path.$name, $value) dongsheng@623: if $self->debug(); dongsheng@623: $text .= $self->recode_skipped_text($value); dongsheng@623: } dongsheng@623: $text .= $quot; dongsheng@623: } dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: unless ($complete) { dongsheng@623: my $ontagerror = $self->{options}{'ontagerror'}; dongsheng@623: if ($ontagerror eq "warn") { dongsheng@623: warn wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax. Continuing...")); dongsheng@623: } elsif ($ontagerror ne "silent") { dongsheng@623: die wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax")); dongsheng@623: } dongsheng@623: } dongsheng@623: } dongsheng@623: } dongsheng@623: return $text; dongsheng@623: } dongsheng@623: dongsheng@623: # Returns an empty string if the content in the $path should not be dongsheng@623: # translated. dongsheng@623: # dongsheng@623: # Otherwise, returns the set of options for translation: dongsheng@623: # w: the content shall be re-wrapped dongsheng@623: # W: the content shall not be re-wrapped dongsheng@623: # i: the tag shall be inlined dongsheng@623: # p: a placeholder shall replace the tag (and its content) dongsheng@623: # dongsheng@623: # A translatable inline tag in an untranslated tag is treated as a translatable breaking tag. dongsheng@623: my %translate_options_cache; dongsheng@623: sub get_translate_options { dongsheng@623: my $self = shift; dongsheng@623: my $path = shift; dongsheng@623: dongsheng@623: if (defined $translate_options_cache{$path}) { dongsheng@623: return $translate_options_cache{$path}; dongsheng@623: } dongsheng@623: dongsheng@623: my $options = ""; dongsheng@623: my $translate = 0; dongsheng@623: my $usedefault = 1; dongsheng@623: dongsheng@623: my $inlist = 0; dongsheng@623: my $tag = $self->get_tag_from_list($path, $self->{tags}); dongsheng@623: if (defined $tag) { dongsheng@623: $inlist = 1; dongsheng@623: } dongsheng@623: if ($self->{options}{'tagsonly'} eq $inlist) { dongsheng@623: $usedefault = 0; dongsheng@623: if (defined $tag) { dongsheng@623: $options = $tag; dongsheng@623: $options =~ s/<.*$//; dongsheng@623: } else { dongsheng@623: if ($self->{options}{'wrap'}) { dongsheng@623: $options = "w"; dongsheng@623: } else { dongsheng@623: $options = "W"; dongsheng@623: } dongsheng@623: } dongsheng@623: $translate = 1; dongsheng@623: } dongsheng@623: dongsheng@623: # TODO: a less precise set of tags should not override a more precise one dongsheng@623: # The tags and tagsonly options are deprecated. dongsheng@623: # The translated and untranslated options have an higher priority. dongsheng@623: $tag = $self->get_tag_from_list($path, $self->{translated}); dongsheng@623: if (defined $tag) { dongsheng@623: $usedefault = 0; dongsheng@623: $options = $tag; dongsheng@623: $options =~ s/<.*$//; dongsheng@623: $translate = 1; dongsheng@623: } dongsheng@623: dongsheng@623: if ($translate and $options !~ m/w/i) { dongsheng@623: $options .= ($self->{options}{'wrap'})?"w":"W"; dongsheng@623: } dongsheng@623: dongsheng@623: if (not defined $tag) { dongsheng@623: $tag = $self->get_tag_from_list($path, $self->{untranslated}); dongsheng@623: if (defined $tag) { dongsheng@623: $usedefault = 0; dongsheng@623: $options = ""; dongsheng@623: $translate = 0; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: $tag = $self->get_tag_from_list($path, $self->{inline}); dongsheng@623: if (defined $tag) { dongsheng@623: $usedefault = 0; dongsheng@623: $options .= "i"; dongsheng@623: } else { dongsheng@623: $tag = $self->get_tag_from_list($path, $self->{placeholder}); dongsheng@623: if (defined $tag) { dongsheng@623: $usedefault = 0; dongsheng@623: $options .= "p"; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: if ($usedefault) { dongsheng@623: $options = $self->{options}{'defaulttranslateoption'}; dongsheng@623: } dongsheng@623: dongsheng@623: # A translatable inline tag in an untranslated tag is treated as a dongsheng@623: # translatable breaking tag. dongsheng@623: if ($options =~ m/i/) { dongsheng@623: my $ppath = $path; dongsheng@623: $ppath =~ s/<[^>]*>$//; dongsheng@623: my $poptions = $self->get_translate_options ($ppath); dongsheng@623: if ($poptions eq "") { dongsheng@623: $options =~ s/i//; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: if ($options =~ m/i/ and $self->{options}{'foldattributes'}) { dongsheng@623: $options .= "f"; dongsheng@623: } dongsheng@623: dongsheng@623: $translate_options_cache{$path} = $options; dongsheng@623: return $options; dongsheng@623: } dongsheng@623: dongsheng@623: dongsheng@623: # Return the tag (or biggest set of tags) of a list which matches with the dongsheng@623: # given path. dongsheng@623: # dongsheng@623: # The tag (or set of tags) is returned with its options. dongsheng@623: # dongsheng@623: # If no tags could match the path, undef is returned. dongsheng@623: sub get_tag_from_list ($$$) { dongsheng@623: my ($self,$path,$list) = @_; dongsheng@623: if ($self->{options}{'caseinsensitive'}) { dongsheng@623: $path = lc $path; dongsheng@623: } dongsheng@623: dongsheng@623: while (1) { dongsheng@623: if (defined $list->{$path}) { dongsheng@623: return $list->{$path}.$path; dongsheng@623: } dongsheng@623: last unless ($path =~ m/</); dongsheng@623: $path =~ s/^<.*?>//; dongsheng@623: } dongsheng@623: dongsheng@623: return undef; dongsheng@623: } dongsheng@623: dongsheng@623: dongsheng@623: dongsheng@623: sub treat_content { dongsheng@623: my $self = shift; dongsheng@623: my $blank=""; dongsheng@623: # Indicates if the paragraph will have to be translated dongsheng@623: my $translate = ""; dongsheng@623: dongsheng@623: my ($eof,@paragraph)=$self->get_string_until('<',{remove=>1}); dongsheng@623: dongsheng@623: while (!$eof and !$self->breaking_tag) { dongsheng@623: NEXT_TAG: dongsheng@623: my @text; dongsheng@623: my $type = $self->tag_type; dongsheng@623: my $f_extract = $tag_types[$type]->{'f_extract'}; dongsheng@623: if ( defined($f_extract) dongsheng@623: and $f_extract eq \&tag_extract_comment) { dongsheng@623: # Remove the content of the comments dongsheng@623: ($eof, @text) = $self->extract_tag($type,1); dongsheng@623: $text[$#text-1] .= "\0"; dongsheng@623: if ($tag_types[$type]->{'beginning'} eq "!--#") { dongsheng@623: $text[0] = "#".$text[0]; dongsheng@623: } dongsheng@623: push @comments, @text; dongsheng@623: } else { dongsheng@623: my ($tmpeof, @tag) = $self->extract_tag($type,0); dongsheng@623: # Append the found inline tag dongsheng@623: ($eof,@text)=$self->get_string_until('>', dongsheng@623: {include=>1, dongsheng@623: remove=>1, dongsheng@623: unquoted=>1}); dongsheng@623: # Append or remove the opening/closing tag from dongsheng@623: # the tag path dongsheng@623: if ($tag_types[$type]->{'end'} eq "") { dongsheng@623: if ($tag_types[$type]->{'beginning'} eq "") { dongsheng@623: # Opening inline tag dongsheng@623: my $cur_tag_name = $self->get_tag_name(@tag); dongsheng@623: my $t_opts = $self->get_translate_options($self->get_path($cur_tag_name)); dongsheng@623: if ($t_opts =~ m/p/) { dongsheng@623: # We enter a new holder. dongsheng@623: # Append a <placeholder ...> tag to the current dongsheng@623: # paragraph, and save the @paragraph in the dongsheng@623: # current holder. dongsheng@623: my $last_holder = $save_holders[$#save_holders]; dongsheng@623: my $placeholder_str = "<placeholder type=\"".$cur_tag_name."\" id=\"".($#{$last_holder->{'sub_translations'}}+1)."\"/>"; dongsheng@623: push @paragraph, ($placeholder_str, $text[1]); dongsheng@623: my @saved_paragraph = @paragraph; dongsheng@623: dongsheng@623: $last_holder->{'paragraph'} = \@saved_paragraph; dongsheng@623: dongsheng@623: # Then we must push a new holder dongsheng@623: my @new_paragraph = (); dongsheng@623: my @sub_translations = (); dongsheng@623: my %folded_attributes; dongsheng@623: my %new_holder = ('paragraph' => \@new_paragraph, dongsheng@623: 'open' => $text[0], dongsheng@623: 'translation' => "", dongsheng@623: 'close' => undef, dongsheng@623: 'sub_translations' => \@sub_translations, dongsheng@623: 'folded_attributes' => \%folded_attributes); dongsheng@623: push @save_holders, \%new_holder; dongsheng@623: @text = (); dongsheng@623: dongsheng@623: # The current @paragraph dongsheng@623: # (for the current holder) dongsheng@623: # is empty. dongsheng@623: @paragraph = (); dongsheng@623: } elsif ($t_opts =~ m/f/) { dongsheng@623: my $tag_full = $self->join_lines(@text); dongsheng@623: my $tag_ref = $text[1]; dongsheng@623: if ($tag_full =~ m/^<\s*\S+\s+\S.*>$/s) { dongsheng@623: my $holder = $save_holders[$#save_holders]; dongsheng@623: my $id = 0; dongsheng@623: foreach (keys %{$holder->{folded_attributes}}) { dongsheng@623: $id = $_ + 1 if ($_ >= $id); dongsheng@623: } dongsheng@623: $holder->{folded_attributes}->{$id} = $tag_full; dongsheng@623: dongsheng@623: @text = ("<$cur_tag_name po4a-id=$id>", $tag_ref); dongsheng@623: } dongsheng@623: } dongsheng@623: push @path, $cur_tag_name; dongsheng@623: } elsif ($tag_types[$type]->{'beginning'} eq "/") { dongsheng@623: # Closing inline tag dongsheng@623: dongsheng@623: # Check if this is closing the dongsheng@623: # last opening tag we detected. dongsheng@623: my $test = pop @path; dongsheng@623: my $name = $self->get_tag_name(@tag); dongsheng@623: if (!defined($test) || dongsheng@623: $test ne $name ) { dongsheng@623: my $ontagerror = $self->{options}{'ontagerror'}; dongsheng@623: if ($ontagerror eq "warn") { dongsheng@623: warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name); dongsheng@623: } elsif ($ontagerror ne "silent") { dongsheng@623: die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name); dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: if ($self->get_translate_options($self->get_path($self->get_tag_name(@tag))) =~ m/p/) { dongsheng@623: # This closes the current holder. dongsheng@623: dongsheng@623: push @path, $self->get_tag_name(@tag); dongsheng@623: # Now translate this paragraph if needed. dongsheng@623: # This will call pushline and append the dongsheng@623: # translation to the current holder's translation. dongsheng@623: $self->translate_paragraph(@paragraph); dongsheng@623: pop @path; dongsheng@623: dongsheng@623: # Now that this holder is closed, we can remove dongsheng@623: # the holder from the stack. dongsheng@623: my $holder = pop @save_holders; dongsheng@623: # We need to keep the translation of this holder dongsheng@623: my $translation = $holder->{'open'}.$holder->{'translation'}.$text[0]; dongsheng@623: # FIXME: @text could be multilines. dongsheng@623: dongsheng@623: @text = (); dongsheng@623: dongsheng@623: # Then we store the translation in the previous dongsheng@623: # holder's sub_translations array dongsheng@623: my $previous_holder = $save_holders[$#save_holders]; dongsheng@623: push @{$previous_holder->{'sub_translations'}}, $translation; dongsheng@623: # We also need to restore the @paragraph array, as dongsheng@623: # it was before we encountered the holder. dongsheng@623: @paragraph = @{$previous_holder->{'paragraph'}}; dongsheng@623: } dongsheng@623: } dongsheng@623: } dongsheng@623: push @paragraph, @text; dongsheng@623: } dongsheng@623: dongsheng@623: # Next tag dongsheng@623: ($eof,@text)=$self->get_string_until('<',{remove=>1}); dongsheng@623: if ($#text > 0) { dongsheng@623: # Check if text (extracted after the inline tag) dongsheng@623: # has to be translated dongsheng@623: push @paragraph, @text; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: # This strips the extracted strings dongsheng@623: # (only if you don't specify the 'nostrip' option, and if the dongsheng@623: # paragraph can be re-wrapped) dongsheng@623: $translate = $self->get_translate_options($self->get_path); dongsheng@623: if (!$self->{options}{'nostrip'} and $translate !~ m/W/) { dongsheng@623: my $clean = 0; dongsheng@623: # Clean the beginning dongsheng@623: while (!$clean and $#paragraph > 0) { dongsheng@623: $paragraph[0] =~ /^(\s*)(.*)/s; dongsheng@623: my $match = $1; dongsheng@623: if ($paragraph[0] eq $match) { dongsheng@623: if ($match ne "") { dongsheng@623: $self->pushline($match); dongsheng@623: } dongsheng@623: shift @paragraph; dongsheng@623: shift @paragraph; dongsheng@623: } else { dongsheng@623: $paragraph[0] = $2; dongsheng@623: if ($match ne "") { dongsheng@623: $self->pushline($match); dongsheng@623: } dongsheng@623: $clean = 1; dongsheng@623: } dongsheng@623: } dongsheng@623: $clean = 0; dongsheng@623: # Clean the end dongsheng@623: while (!$clean and $#paragraph > 0) { dongsheng@623: $paragraph[$#paragraph-1] =~ /^(.*?)(\s*)$/s; dongsheng@623: my $match = $2; dongsheng@623: if ($paragraph[$#paragraph-1] eq $match) { dongsheng@623: if ($match ne "") { dongsheng@623: $blank = $match.$blank; dongsheng@623: } dongsheng@623: pop @paragraph; dongsheng@623: pop @paragraph; dongsheng@623: } else { dongsheng@623: $paragraph[$#paragraph-1] = $1; dongsheng@623: if ($match ne "") { dongsheng@623: $blank = $match.$blank; dongsheng@623: } dongsheng@623: $clean = 1; dongsheng@623: } dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: # Translate the string when needed dongsheng@623: # This will either push the translation in the translated document or dongsheng@623: # in the current holder translation. dongsheng@623: $self->translate_paragraph(@paragraph); dongsheng@623: dongsheng@623: # Push the trailing blanks dongsheng@623: if ($blank ne "") { dongsheng@623: $self->pushline($blank); dongsheng@623: } dongsheng@623: return $eof; dongsheng@623: } dongsheng@623: dongsheng@623: # Translate a @paragraph array of (string, reference). dongsheng@623: # The $translate argument indicates if the strings must be translated or dongsheng@623: # just pushed dongsheng@623: sub translate_paragraph { dongsheng@623: my $self = shift; dongsheng@623: my @paragraph = @_; dongsheng@623: my $translate = $self->get_translate_options($self->get_path); dongsheng@623: dongsheng@623: while ( (scalar @paragraph) dongsheng@623: and ($paragraph[0] =~ m/^\s*\n/s)) { dongsheng@623: $self->pushline($paragraph[0]); dongsheng@623: shift @paragraph; dongsheng@623: shift @paragraph; dongsheng@623: } dongsheng@623: dongsheng@623: my $comments; dongsheng@623: while (@comments) { dongsheng@623: my ($comment,$eoc); dongsheng@623: do { dongsheng@623: my ($t,$l) = (shift @comments, shift @comments); dongsheng@623: $t =~ s/\n?(\0)?$//; dongsheng@623: $eoc = $1; dongsheng@623: $comment .= "\n" if defined $comment; dongsheng@623: $comment .= $t; dongsheng@623: } until ($eoc); dongsheng@623: $comments .= "\n" if defined $comments; dongsheng@623: $comments .= $comment; dongsheng@623: $self->pushline("<!--".$comment."-->\n") if defined $comment; dongsheng@623: } dongsheng@623: @comments = (); dongsheng@623: dongsheng@623: if ($self->{options}{'cpp'}) { dongsheng@623: my @tmp = @paragraph; dongsheng@623: @paragraph = (); dongsheng@623: while (@tmp) { dongsheng@623: my ($t,$l) = (shift @tmp, shift @tmp); dongsheng@623: # #include can be followed by a filename between dongsheng@623: # <> brackets. In that case, the argument won't be dongsheng@623: # handled in the same call to translate_paragraph. dongsheng@623: # Thus do not try to match "include ". dongsheng@623: if ($t =~ m/^#[ \t]*(if |endif|undef |include|else|ifdef |ifndef |define )/si) { dongsheng@623: if (@paragraph) { dongsheng@623: $self->translate_paragraph(@paragraph); dongsheng@623: @paragraph = (); dongsheng@623: $self->pushline("\n"); dongsheng@623: } dongsheng@623: $self->pushline($t); dongsheng@623: } else { dongsheng@623: push @paragraph, ($t,$l); dongsheng@623: } dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: my $para = $self->join_lines(@paragraph); dongsheng@623: if ( length($para) > 0 ) { dongsheng@623: if ($translate ne "") { dongsheng@623: # This tag should be translated dongsheng@623: $self->pushline($self->found_string( dongsheng@623: $para, dongsheng@623: $paragraph[1], { dongsheng@623: type=>"tag", dongsheng@623: tag_options=>$translate, dongsheng@623: comments=>$comments dongsheng@623: })); dongsheng@623: } else { dongsheng@623: # Inform that this tag isn't translated in debug mode dongsheng@623: print wrap_ref_mod($paragraph[1], "po4a::xml", dgettext ("po4a", "Content of tag %s excluded: %s"), $self->get_path, $para) dongsheng@623: if $self->debug(); dongsheng@623: $self->pushline($self->recode_skipped_text($para)); dongsheng@623: } dongsheng@623: } dongsheng@623: # Now the paragraph is fully translated. dongsheng@623: # If we have all the holders' translation, we can replace the dongsheng@623: # placeholders by their translations. dongsheng@623: # We must wait to have all the translations because the holders are dongsheng@623: # numbered. dongsheng@623: { dongsheng@623: my $holder = $save_holders[$#save_holders]; dongsheng@623: my $translation = $holder->{'translation'}; dongsheng@623: dongsheng@623: # Count the number of <placeholder ...> in $translation dongsheng@623: my $count = 0; dongsheng@623: my $str = $translation; dongsheng@623: while ( (defined $str) dongsheng@623: and ($str =~ m/^.*?<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s)) { dongsheng@623: $count += 1; dongsheng@623: $str = $2; dongsheng@623: if ($holder->{'sub_translations'}->[$1] =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s) { dongsheng@623: $count = -1; dongsheng@623: last; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: if ( (defined $translation) dongsheng@623: and (scalar(@{$holder->{'sub_translations'}}) == $count)) { dongsheng@623: # OK, all the holders of the current paragraph are dongsheng@623: # closed (and translated). dongsheng@623: # Replace them by their translation. dongsheng@623: while ($translation =~ m/^(.*?)<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s) { dongsheng@623: # FIXME: we could also check that dongsheng@623: # * the holder exists dongsheng@623: # * all the holders are used dongsheng@623: $translation = $1.$holder->{'sub_translations'}->[$2].$3; dongsheng@623: } dongsheng@623: # We have our translation dongsheng@623: $holder->{'translation'} = $translation; dongsheng@623: # And there is no need for any holder in it. dongsheng@623: my @sub_translations = (); dongsheng@623: $holder->{'sub_translations'} = \@sub_translations; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: } dongsheng@623: dongsheng@623: dongsheng@623: dongsheng@623: =head2 WORKING WITH THE MODULE OPTIONS dongsheng@623: dongsheng@623: =over 4 dongsheng@623: dongsheng@623: =item treat_options() dongsheng@623: dongsheng@623: This function fills the internal structures that contain the tags, attributes dongsheng@623: and inline data with the options of the module (specified in the command-line dongsheng@623: or in the initialize function). dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub treat_options { dongsheng@623: my $self = shift; dongsheng@623: dongsheng@623: if ($self->{options}{'caseinsensitive'}) { dongsheng@623: $self->{options}{'nodefault'} = lc $self->{options}{'nodefault'}; dongsheng@623: $self->{options}{'tags'} = lc $self->{options}{'tags'}; dongsheng@623: $self->{options}{'break'} = lc $self->{options}{'break'}; dongsheng@623: $self->{options}{'_default_break'} = lc $self->{options}{'_default_break'}; dongsheng@623: $self->{options}{'translated'} = lc $self->{options}{'translated'}; dongsheng@623: $self->{options}{'_default_translated'} = lc $self->{options}{'_default_translated'}; dongsheng@623: $self->{options}{'untranslated'} = lc $self->{options}{'untranslated'}; dongsheng@623: $self->{options}{'_default_untranslated'} = lc $self->{options}{'_default_untranslated'}; dongsheng@623: $self->{options}{'attributes'} = lc $self->{options}{'attributes'}; dongsheng@623: $self->{options}{'_default_attributes'} = lc $self->{options}{'_default_attributes'}; dongsheng@623: $self->{options}{'inline'} = lc $self->{options}{'inline'}; dongsheng@623: $self->{options}{'_default_inline'} = lc $self->{options}{'_default_inline'}; dongsheng@623: $self->{options}{'placeholder'} = lc $self->{options}{'placeholder'}; dongsheng@623: $self->{options}{'_default_placeholder'} = lc $self->{options}{'_default_placeholder'}; dongsheng@623: } dongsheng@623: dongsheng@623: $self->{options}{'nodefault'} =~ /^\s*(.*)\s*$/s; dongsheng@623: my %list_nodefault; dongsheng@623: foreach (split(/\s+/s,$1)) { dongsheng@623: $list_nodefault{$_} = 1; dongsheng@623: } dongsheng@623: $self->{nodefault} = \%list_nodefault; dongsheng@623: dongsheng@623: $self->{options}{'tags'} =~ /^\s*(.*)\s*$/s; dongsheng@623: if (length $self->{options}{'tags'}) { dongsheng@623: warn wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", dongsheng@623: "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tags"); dongsheng@623: } dongsheng@623: foreach (split(/\s+/s,$1)) { dongsheng@623: $_ =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{tags}->{$2} = $1 || ""; dongsheng@623: } dongsheng@623: dongsheng@623: if ($self->{options}{'tagsonly'}) { dongsheng@623: warn wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", dongsheng@623: "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tagsonly"); dongsheng@623: } dongsheng@623: dongsheng@623: $self->{options}{'break'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{break}->{$2} = $1 || ""; dongsheng@623: } dongsheng@623: $self->{options}{'_default_break'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{break}->{$2} = $1 || "" dongsheng@623: unless $list_nodefault{$2} dongsheng@623: or defined $self->{break}->{$2}; dongsheng@623: } dongsheng@623: dongsheng@623: $self->{options}{'translated'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{translated}->{$2} = $1 || ""; dongsheng@623: } dongsheng@623: $self->{options}{'_default_translated'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{translated}->{$2} = $1 || "" dongsheng@623: unless $list_nodefault{$2} dongsheng@623: or defined $self->{translated}->{$2}; dongsheng@623: } dongsheng@623: dongsheng@623: $self->{options}{'untranslated'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{untranslated}->{$2} = $1 || ""; dongsheng@623: } dongsheng@623: $self->{options}{'_default_untranslated'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{untranslated}->{$2} = $1 || "" dongsheng@623: unless $list_nodefault{$2} dongsheng@623: or defined $self->{untranslated}->{$2}; dongsheng@623: } dongsheng@623: dongsheng@623: $self->{options}{'attributes'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: if ($tag =~ m/^(.*?)(<.*)$/) { dongsheng@623: $self->{attributes}->{$2} = $1 || ""; dongsheng@623: } else { dongsheng@623: $self->{attributes}->{$tag} = ""; dongsheng@623: } dongsheng@623: } dongsheng@623: $self->{options}{'_default_attributes'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: if ($tag =~ m/^(.*?)(<.*)$/) { dongsheng@623: $self->{attributes}->{$2} = $1 || "" dongsheng@623: unless $list_nodefault{$2} dongsheng@623: or defined $self->{attributes}->{$2}; dongsheng@623: } else { dongsheng@623: $self->{attributes}->{$tag} = "" dongsheng@623: unless $list_nodefault{$tag} dongsheng@623: or defined $self->{attributes}->{$tag}; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: my @list_inline; dongsheng@623: $self->{options}{'inline'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{inline}->{$2} = $1 || ""; dongsheng@623: } dongsheng@623: $self->{options}{'_default_inline'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{inline}->{$2} = $1 || "" dongsheng@623: unless $list_nodefault{$2} dongsheng@623: or defined $self->{inline}->{$2}; dongsheng@623: } dongsheng@623: dongsheng@623: $self->{options}{'placeholder'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{placeholder}->{$2} = $1 || ""; dongsheng@623: } dongsheng@623: $self->{options}{'_default_placeholder'} =~ /^\s*(.*)\s*$/s; dongsheng@623: foreach my $tag (split(/\s+/s,$1)) { dongsheng@623: $tag =~ m/^(.*?)(<.*)$/; dongsheng@623: $self->{placeholder}->{$2} = $1 || "" dongsheng@623: unless $list_nodefault{$2} dongsheng@623: or defined $self->{placeholder}->{$2}; dongsheng@623: } dongsheng@623: dongsheng@623: # There should be no translated and untranslated tags dongsheng@623: foreach my $tag (keys %{$self->{translated}}) { dongsheng@623: die wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", dongsheng@623: "Tag '%s' both in the %s and %s categories."), $tag, "translated", "untranslated") dongsheng@623: if defined $self->{untranslated}->{$tag}; dongsheng@623: } dongsheng@623: # There should be no inline, break, and placeholder tags dongsheng@623: foreach my $tag (keys %{$self->{inline}}) { dongsheng@623: die wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", dongsheng@623: "Tag '%s' both in the %s and %s categories."), $tag, "inline", "break") dongsheng@623: if defined $self->{break}->{$tag}; dongsheng@623: die wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", dongsheng@623: "Tag '%s' both in the %s and %s categories."), $tag, "inline", "placeholder") dongsheng@623: if defined $self->{placeholder}->{$tag}; dongsheng@623: } dongsheng@623: foreach my $tag (keys %{$self->{break}}) { dongsheng@623: die wrap_mod("po4a::xml", dongsheng@623: dgettext("po4a", dongsheng@623: "Tag '%s' both in the %s and %s categories."), $tag, "break", "placeholder") dongsheng@623: if defined $self->{placeholder}->{$tag}; dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: =head2 GETTING TEXT FROM THE INPUT DOCUMENT dongsheng@623: dongsheng@623: =over dongsheng@623: dongsheng@623: =item get_string_until($%) dongsheng@623: dongsheng@623: This function returns an array with the lines (and references) from the input dongsheng@623: document until it finds the first argument. The second argument is an options dongsheng@623: hash. Value 0 means disabled (the default) and 1, enabled. dongsheng@623: dongsheng@623: The valid options are: dongsheng@623: dongsheng@623: =over 4 dongsheng@623: dongsheng@623: =item include dongsheng@623: dongsheng@623: This makes the returned array to contain the searched text dongsheng@623: dongsheng@623: =item remove dongsheng@623: dongsheng@623: This removes the returned stream from the input dongsheng@623: dongsheng@623: =item unquoted dongsheng@623: dongsheng@623: This ensures that the searched text is outside any quotes dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub get_string_until { dongsheng@623: my ($self,$search) = (shift,shift); dongsheng@623: my $options = shift; dongsheng@623: my ($include,$remove,$unquoted, $regex) = (0,0,0,0); dongsheng@623: dongsheng@623: if (defined($options->{include})) { $include = $options->{include}; } dongsheng@623: if (defined($options->{remove})) { $remove = $options->{remove}; } dongsheng@623: if (defined($options->{unquoted})) { $unquoted = $options->{unquoted}; } dongsheng@623: if (defined($options->{regex})) { $regex = $options->{regex}; } dongsheng@623: dongsheng@623: my ($line,$ref) = $self->shiftline(); dongsheng@623: my (@text,$paragraph); dongsheng@623: my ($eof,$found) = (0,0); dongsheng@623: dongsheng@623: $search = "\Q$search\E" unless $regex; dongsheng@623: while (defined($line) and !$found) { dongsheng@623: push @text, ($line,$ref); dongsheng@623: $paragraph .= $line; dongsheng@623: if ($unquoted) { dongsheng@623: if ( $paragraph =~ /^((\".*?\")|(\'.*?\')|[^\"\'])*$search/s ) { dongsheng@623: $found = 1; dongsheng@623: } dongsheng@623: } else { dongsheng@623: if ( $paragraph =~ /$search/s ) { dongsheng@623: $found = 1; dongsheng@623: } dongsheng@623: } dongsheng@623: if (!$found) { dongsheng@623: ($line,$ref)=$self->shiftline(); dongsheng@623: } dongsheng@623: } dongsheng@623: dongsheng@623: if (!defined($line)) { $eof = 1; } dongsheng@623: dongsheng@623: if ( $found ) { dongsheng@623: $line = ""; dongsheng@623: if($unquoted) { dongsheng@623: $paragraph =~ /^(?:(?:\".*?\")|(?:\'.*?\')|[^\"\'])*?$search(.*)$/s; dongsheng@623: $line = $1; dongsheng@623: $text[$#text-1] =~ s/\Q$line\E$//s; dongsheng@623: } else { dongsheng@623: $paragraph =~ /$search(.*)$/s; dongsheng@623: $line = $1; dongsheng@623: $text[$#text-1] =~ s/\Q$line\E$//s; dongsheng@623: } dongsheng@623: if(!$include) { dongsheng@623: $text[$#text-1] =~ /^(.*)($search.*)$/s; dongsheng@623: $text[$#text-1] = $1; dongsheng@623: $line = $2.$line; dongsheng@623: } dongsheng@623: if (defined($line) and ($line ne "")) { dongsheng@623: $self->unshiftline ($line,$text[$#text]); dongsheng@623: } dongsheng@623: } dongsheng@623: if (!$remove) { dongsheng@623: $self->unshiftline (@text); dongsheng@623: } dongsheng@623: dongsheng@623: #If we get to the end of the file, we return the whole paragraph dongsheng@623: return ($eof,@text); dongsheng@623: } dongsheng@623: dongsheng@623: =item skip_spaces(\@) dongsheng@623: dongsheng@623: This function receives as argument the reference to a paragraph (in the format dongsheng@623: returned by get_string_until), skips his heading spaces and returns them as dongsheng@623: a simple string. dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub skip_spaces { dongsheng@623: my ($self,$pstring)=@_; dongsheng@623: my $space=""; dongsheng@623: dongsheng@623: while (@$pstring and (@$pstring[0] =~ /^(\s+)(.*)$/s or @$pstring[0] eq "")) { dongsheng@623: if (@$pstring[0] ne "") { dongsheng@623: $space .= $1; dongsheng@623: @$pstring[0] = $2; dongsheng@623: } dongsheng@623: dongsheng@623: if (@$pstring[0] eq "") { dongsheng@623: shift @$pstring; dongsheng@623: shift @$pstring; dongsheng@623: } dongsheng@623: } dongsheng@623: return $space; dongsheng@623: } dongsheng@623: dongsheng@623: =item join_lines(@) dongsheng@623: dongsheng@623: This function returns a simple string with the text from the argument array dongsheng@623: (discarding the references). dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: sub join_lines { dongsheng@623: my ($self,@lines)=@_; dongsheng@623: my ($line,$ref); dongsheng@623: my $text = ""; dongsheng@623: while ($#lines > 0) { dongsheng@623: ($line,$ref) = (shift @lines,shift @lines); dongsheng@623: $text .= $line; dongsheng@623: } dongsheng@623: return $text; dongsheng@623: } dongsheng@623: dongsheng@623: =back dongsheng@623: dongsheng@623: =head1 STATUS OF THIS MODULE dongsheng@623: dongsheng@623: This module can translate tags and attributes. dongsheng@623: dongsheng@623: =head1 TODO LIST dongsheng@623: dongsheng@623: DOCTYPE (ENTITIES) dongsheng@623: dongsheng@623: There is a minimal support for the translation of entities. They are dongsheng@623: translated as a whole, and tags are not taken into account. Multilines dongsheng@623: entities are not supported and entities are always rewrapped during the dongsheng@623: translation. dongsheng@623: dongsheng@623: MODIFY TAG TYPES FROM INHERITED MODULES dongsheng@623: (move the tag_types structure inside the $self hash?) dongsheng@623: dongsheng@623: =head1 SEE ALSO dongsheng@623: dongsheng@623: L<po4a(7)|po4a.7>, L<Locale::Po4a::TransTractor(3pm)|Locale::Po4a::TransTractor>. dongsheng@623: dongsheng@623: =head1 AUTHORS dongsheng@623: dongsheng@623: Jordi Vilalta <jvprat@gmail.com> dongsheng@623: Nicolas François <nicolas.francois@centraliens.net> dongsheng@623: dongsheng@623: =head1 COPYRIGHT AND LICENSE dongsheng@623: dongsheng@623: Copyright (c) 2004 by Jordi Vilalta <jvprat@gmail.com> dongsheng@623: Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net> dongsheng@623: dongsheng@623: This program is free software; you may redistribute it and/or modify it dongsheng@623: under the terms of GPL (see the COPYING file). dongsheng@623: dongsheng@623: =cut dongsheng@623: dongsheng@623: 1;