/usr/local/CPAN/LaTeX-TOM/LaTeX/TOM/Parser.pm
###############################################################################
#
# LaTeX::TOM::Parser
#
# The parsing class
#
###############################################################################
package LaTeX::TOM::Parser;
use strict;
use base qw(
LaTeX::TOM::Node
LaTeX::TOM::Tree
);
our $VERSION = '0.01';
# Constructor
#
sub new {
my $class = shift;
my $self = {};
bless $self, ref($class) || $class;
$self->_init(@_);
return $self;
}
# Set/reset "globals"
#
sub _init {
my $parser = shift;
my ($parse_errors_fatal, $readinputs, $applymappings) = @_;
my $retrieve_opt_with_default = sub
{
my ($opt, $default) = @_;
return $opt if defined $opt;
return $default;
};
# set user options
#
$parser->{readinputs} = $retrieve_opt_with_default->($readinputs, 0);
$parser->{applymappings} = $retrieve_opt_with_default->($applymappings, 0);
$parser->{PARSE_ERRORS_FATAL} = $retrieve_opt_with_default->($parse_errors_fatal, $parser->{config}{PARSE_ERRORS_FATAL});
# init internal stuff
#
$parser->{MATHBRACKETS} = $parser->{config}{MATHBRACKETS};
# this will hold a running list/hash of commands that have been remapped
$parser->{MAPPEDCMDS} = {};
# this will hold a running list/hash of commands that have been used. We dont
# bother apply mappings except to commands that have been used.
$parser->{USED_COMMANDS} = {};
# no file yet
$parser->{file} = undef;
}
# Parse a LaTeX file, return a tree. You probably want this method.
#
sub parseFile {
my $parser = shift;
my $filename = shift;
# init variables
#
$parser->{file} = $filename; # file name member data
my $tree = {}; # init output tree
# read in text from file
#
my $text = _readFile($filename);
# do the parse or bomb out
#
if ($text) {
$tree = $parser->parse($text);
} else {
die "Could not read file $filename !";
}
return $tree;
}
# main parsing entrypoint
#
sub parse {
my $parser = shift;
my $text = shift;
# first half of parsing (goes up to finding commands, reading inputs)
#
my ($tree, $bracehash) = $parser->_parseA($text);
#print "done with _parseA\n";
#$tree->print();
# handle mappings
#
$parser->_applyMappings($tree) if $parser->{applymappings};
#print "done with _applyMappings\n";
#$tree->print();
# second half of parsing (environments)
#
$parser->_parseB($tree);
#print "done with _parseB\n";
#$tree->print();
# once all the above is done we can propegate math/plaintext modes down
#
$parser->_propegateModes($tree, 0, 0); # math = 0, plaintext = 0
#print "done with _propegateModes\n";
#$tree->print();
# handle kooky \[ \] math mode
#
if (not exists $parser->{MAPPEDCMDS}->{'\\['}) {
# math mode (\[ \], \( \))
$parser->_stage5($tree, {'\\[' => '\\]', '\\(' => '\\)'}, 1);
$parser->_propegateModes($tree, 0, 0); # have to do this again of course
$parser->{MATHBRACKETS}->{'\\['} = '\\]'; # put back in brackets list for
$parser->{MATHBRACKETS}->{'\\('} = '\\)'; # printing purposes.
}
#$tree->print();
$tree->listify; # add linked-list stuff
return $tree;
}
# Parsing with no mappings and no externally accessible parser object.
#
sub _basicparse {
my $parser = shift; # @_ would break code
my $text = shift;
my $parse_errors_fatal = (defined $_[0] ? $_[0] : $parser->{config}{PARSE_ERRORS_FATAL});
my $readinputs = (defined $_[1] ? $_[1] : 1);
$parser = LaTeX::TOM::Parser->new($parse_errors_fatal, $readinputs);
my ($tree, $bracehash) = $parser->_parseA($text);
$parser->_parseB($tree);
$tree->listify; # add linked-list stuff
return ($tree, $bracehash);
}
# start the tree. separate out comment and text nodes.
#
sub _stage1 {
my $parser = shift;
my $text = shift;
my @nodes = _getTextAndCommentNodes($text, 0, length($text));
return LaTeX::TOM::Tree->new([@nodes], $parser);
}
# this stage parses the braces ({}) and adds the corresponding structure to
# the tree.
#
sub _stage2 {
my $parser = shift;
my $tree = shift;
my $bracehash = shift || undef;
my $startidx = shift || 0; # last two params for starting at some specific
my $startpos = shift || 0; # node and offset.
my %blankhash;
if (not defined $bracehash) {
$bracehash = {%blankhash};
}
my $leftidx = -1;
my $leftpos = -1;
my $leftcount = 0;
# loop through the nodes
for (my $i = $startidx; $i < @{$tree->{nodes}}; $i++) {
my $node = $tree->{nodes}[$i];
my $spos = $node->{start}; # get text start position
# set position placeholder within the text block
my $pos = ($i == $startidx) ? $startpos : 0;
if ($node->{type} eq 'TEXT') {
#warn "parseStage2: looking at text node: [$node->{content}]";
my ($nextpos, $brace) = _findbrace($node->{content}, $pos);
while ($nextpos != -1) {
$pos = $nextpos + 1; # update position pointer
# handle left brace
if ($brace eq '{') {
#warn "found '{' at position $nextpos, leftcount is $leftcount";
if ($leftcount == 0) {
$leftpos = $nextpos;
$leftidx = $i
}
$leftcount++;
}
# handle right brance
elsif ($brace eq '}') {
#warn "found '}' at position $nextpos , leftcount is $leftcount";
my $rightpos = $nextpos;
$leftcount--;
# found the corresponding right brace to our starting left brace
if ($leftcount == 0) {
# see if we have to split the text node into 3 parts
#
if ($leftidx == $i) {
my ($leftside, $textnode3) = $node->split($rightpos, $rightpos);
my ($textnode1, $textnode2) = $leftside->split($leftpos, $leftpos);
# make the new GROUP node
my $groupnode = LaTeX::TOM::Node->new(
{type => 'GROUP',
start => $textnode2->{start} - 1,
end => $textnode2->{end} + 1,
children => LaTeX::TOM::Tree->new([$textnode2], $parser),
});
# splice the new subtree into the old location
splice @{$tree->{nodes}}, $i, 1, $textnode1, $groupnode, $textnode3;
# add to the brace-pair lookup table
$bracehash->{$groupnode->{start}} = $groupnode->{end};
$bracehash->{$groupnode->{end}} = $groupnode->{start};
# recur into new child node
$parser->_stage2($groupnode->{children}, $bracehash);
$i++; # skip to textnode3 for further processing
}
# split across nodes
#
else {
my ($textnode1, $textnode2) = $tree->{nodes}[$leftidx]->split($leftpos, $leftpos);
my ($textnode3, $textnode4) = $node->split($rightpos, $rightpos);
# remove nodes in between the node we found '{' in and the node
# we found '}' in
#
my @removed = splice @{$tree->{nodes}}, $leftidx+1, $i-$leftidx-1;
# create a group node that contains the text after the left brace,
# then all the nodes up until the next text node, then the text
# before the right brace.
#
my $groupnode = LaTeX::TOM::Node->new(
{type => 'GROUP',
start => $textnode2->{start} - 1,
end => $textnode3->{end} + 1,
children => LaTeX::TOM::Tree->new(
[$textnode2,
@removed,
$textnode3],
$parser),
});
# replace the two original text nodes with the leftover left and
# right portions, as well as the group node with everything in
# the middle.
#
splice @{$tree->{nodes}}, $leftidx, 2, $textnode1, $groupnode, $textnode4;
# add to the brace-pair lookup table
$bracehash->{$groupnode->{start}} = $groupnode->{end};
$bracehash->{$groupnode->{end}} = $groupnode->{start};
# recur into new child nodes
$parser->_stage2($groupnode->{children}, $bracehash);
# step back to textnode4 on this level for further processing
$i -= scalar @removed;
}
$leftpos = -1; # reset left data
$leftidx = -1;
last;
} # $leftcount == 0
# check for '}'-based error
#
if ($leftcount < 0) {
if ($parser->{PARSE_ERRORS_FATAL} == 1) {
die "parse error: '}' before '{' at ".($spos+$rightpos).".";
} elsif ($parser->{PARSE_ERRORS_FATAL} == 0) {
warn "parse error: '}' before '{' at ".($spos+$rightpos).".";
}
# reset and continue
$leftcount = 0;
}
} # right brace
($nextpos, $brace) = _findbrace($node->{content}, $pos);
} # while (braces left)
} # if TEXT
} # loop over all nodes
# check for extra '{' parse error
#
if ($leftcount > 0) {
my $spos = $tree->{nodes}[$leftidx]->{start}; # get text start position
if ($parser->{PARSE_ERRORS_FATAL} == 1) {
die "parse error: unmatched '{' at ".($spos+$leftpos).".";
} elsif ($parser->{PARSE_ERRORS_FATAL} == 0) {
warn "parse error: unmatched '{' at ".($spos+$leftpos).".";
}
# try to continue on, after the offending brace
$parser->_stage2($tree, $bracehash, $leftidx, $leftpos + 1);
}
return $bracehash;
}
# this stage finds LaTeX commands and accordingly turns GROUP nodes into
# command nodes, labeled with the command
#
sub _stage3 {
my $parser = shift;
my $tree = shift;
my $parent = shift;
for (my $i = 0; $i< @{$tree->{nodes}}; $i++) {
my $node = $tree->{nodes}[$i];
# check text node for command tag
if ($node->{type} eq 'TEXT') {
my $text = $node->{content};
# inner command (such as {\command text text}). our regexp checks to see
# if this text chunk begins with \command, since that would be the case
# due to the previous parsing stages. if found, the parent node is
# promoted to a command.
#
if ($text =~ /^\s*\\(\w+\*?)/ && defined $parent && $parser->{config}{INNERCMDS}->{$1}) {
my $command = $1;
# if the parent is already a command node, we have to make a new
# nested command node
#
if ($parent->{type} eq 'COMMAND') {
# make a new command node
my $newnode = LaTeX::TOM::Node->new(
{type => 'COMMAND',
command => $command,
start => $parent->{start},
end => $parent->{end},
position => 'inner',
children => $parent->{children} });
# point parent to it
$parent->{children} = LaTeX::TOM::Tree->new([$newnode], $parser);
# start over at this level (get additional inner commands)
$parent = $newnode;
$i = -1;
$parser->{USED_COMMANDS}->{$newnode->{command}} = 1;
}
# parent is a naked group, we can make it into a command node
#
elsif ($parent->{type} eq 'GROUP') {
$parent->{type} = 'COMMAND';
$parent->{command} = $command;
$parent->{position} = 'inner';
# start over at this level
$i = -1;
$parser->{USED_COMMANDS}->{$parent->{command}} = 1;
}
$node->{content} =~ s/^\s*\\(?:\w+\*?)//o;
}
# outer command (such as \command{parameters}). our regexp checks to
# see if this text chunk ends in \command, since that would be the case
# due to the previous parsing stages.
#
if ($text =~ /(?:^|[^\\])(\\\w+\*?(\s*\[.*?\])?)\s*$/os &&
defined $tree->{nodes}[$i+1] &&
$tree->{nodes}[$i+1]->{type} eq 'GROUP') {
my $tag = $1;
#print "found text node [$text] with command tag [$tag]\n";
# remove the text
$node->{content} =~ s/\\\w+\*?\s*(?:\[.*?\])?\s*$//os;
# parse it for command and ops
$tag =~ /^\\(\w+\*?)\s*(?:\[(.*?)\])?$/os;
my $command = $1;
my $opts = $2;
# make the next node a command node with the above data
my $next = $tree->{nodes}[$i+1];
$next->{type} = 'COMMAND';
$next->{command} = $command;
$next->{opts} = $opts;
$next->{position} = 'outer';
$parser->{USED_COMMANDS}->{$next->{command}} = 1;
}
# recognize braceless commands
#
if ($text =~ /(\\(\w+\*?)[ \t]+(\S+))/gso || $text =~ /(\\(\w+)(\d+))/gso) {
my $all = $1;
my $command = $2;
my $param = $3;
if ($parser->{config}{BRACELESS}->{$command}) {
# warn "found braceless command $command with param $param";
# get location to split from node text
my $a = index $node->{content}, $all, 0;
my $b = $a + length($all) - 1;
# make all the new nodes
# new left and right text nodes
my ($leftnode, $rightnode) = $node->split($a, $b);
# param contents node
my $pstart = index $node->{content}, $param, $a;
my $newchild = LaTeX::TOM::Node->new(
{type => 'TEXT',
start => $node->{start} + $pstart,
end => $node->{start} + $pstart + length($param) - 1,
content => $param });
# new command node
my $commandnode = LaTeX::TOM::Node->new(
{type => 'COMMAND',
braces => 0,
command => $command,
start => $node->{start} + $a,
end => $node->{start} + $b,
children => LaTeX::TOM::Tree->new([$newchild], $parser),
});
$parser->{USED_COMMANDS}->{$commandnode->{command}} = 1;
# splice these all into the original array
splice @{$tree->{nodes}}, $i, 1, $leftnode, $commandnode, $rightnode;
# make the rightnode the node we're currently analyzing
$node = $rightnode;
# make sure outer loop will continue parsing *after* rightnode
$i += 2;
}
}
}
# recur
if ($node->{type} eq 'GROUP' ||
$node->{type} eq 'COMMAND') {
$parser->_stage3($node->{children}, $node);
}
}
}
# this stage finds \begin{x} \end{x} environments and shoves their contents
# down into a new child node, with a parent node of ENVIRONMENT type.
#
# this has the effect of making the tree deeper, since much of the structure
# is in environment tags and will now be picked up.
#
# for ENVIRONMENTs, "start" means the ending } on the \begin tag,
# "end" means the starting \ on the \end tag,
# "ostart" is the starting \ on the "begin" tag,
# "oend" is the ending } on the "end" tag, and
# and "class" is the "x" from above.
#
sub _stage4 {
my $parser = shift;
my $tree = shift;
my $bcount = 0; # \begin "stack count"
my $class = ""; # environment class
my $bidx = 0; # \begin array index.
for (my $i = 0; $i < @{$tree->{nodes}}; $i++) {
my $node = $tree->{nodes}->[$i];
# see if this is a "\begin" command node
if ($node->{type} eq 'COMMAND' && $node->{command} eq 'begin') {
#warn "parseStage4: found a begin COMMAND node, ".$node->{children}->{nodes}[0]->{content}." @ $node->{start}";
# start a new "stack"
if ($bcount == 0) {
$bidx = $i;
$bcount++;
$class = $node->{children}->{nodes}->[0]->{content};
#warn "parseStage4: opening environment tag found, class = $class";
}
# add to the "stack"
elsif ($node->{children}->{nodes}->[0]->{content} eq $class) {
$bcount++;
#warn "parseStage4: incrementing tag count for $class";
}
}
# handle "\end" command nodes
elsif ($node->{type} eq 'COMMAND' &&
$node->{command} eq 'end' &&
$node->{children}->{nodes}->[0]->{content} eq $class) {
$bcount--;
#warn "parseStage4: decrementing tag count for $class";
# we found our closing "\end" tag. replace everything with the proper
# ENVIRONMENT tag and subtree.
#
if ($bcount == 0) {
#warn "parseStage4: closing environment $class";
# first we must take everything between the "\begin" and "\end"
# nodes and put them in a new array, removing them from the old one
my @newarray = splice @{$tree->{nodes}}, $bidx+1, $i - ($bidx + 1);
# make the ENVIRONMENT node
my $start = $tree->{nodes}[$bidx]->{end};
my $end = $node->{start};
my $envnode = LaTeX::TOM::Node->new(
{type => 'ENVIRONMENT',
class => $class,
start => $start, # "inner" start and end
end => $end,
ostart => $start - length('begin') - length($class) - 2,
oend => $end + length('end') + length($class) + 2,
children => LaTeX::TOM::Tree->new([@newarray], $parser),
});
if ($parser->{config}{MATHENVS}->{$envnode->{class}}) {
$envnode->{math} = 1;
}
# replace the \begin and \end COMMAND nodes with the single
# environment node
splice @{$tree->{nodes}}, $bidx, 2, $envnode;
$class = ""; # reset class.
# i is going to change by however many nodes we removed
$i -= scalar @newarray;
# recur into the children
$parser->_stage4($envnode->{children});
}
}
# recur in general
elsif ($node->{children}) {
$parser->_stage4($node->{children});
}
}
# parse error if we're missing an "\end" tag.
if ($bcount > 0) {
if ($parser->{PARSE_ERRORS_FATAL} == 1) {
die "parse error: missing \\end{$class} for \\begin{$class} at position ".$tree->{nodes}[$bidx]->{end}." !";
} elsif ($parser->{PARSE_ERRORS_FATAL} == 0) {
warn "parse error: missing \\end{$class} for \\begin{$class} at position ".$tree->{nodes}[$bidx]->{end}." !";
}
}
}
# This is the "math" stage: here we grab simple-delimeter math modes from
# the text they are embedded in, and turn those into new groupings, with the
# "math" flag set.
#
# having this top level to go over all the bracket types prevents some pretty
# bad combinatorial explosion
#
sub _stage5 {
my $parser = shift;
my $tree = shift;
my $caremath = shift || 0;
my $brackets = $parser->{MATHBRACKETS};
# loop through all the different math mode bracket types
foreach my $left (sort {length($b) <=> length($a)} keys %$brackets) {
my $right = $brackets->{$left};
$parser->_stage5_r($tree, $left, $right, $caremath);
}
}
# recursive meat of above
#
sub _stage5_r {
my $parser = shift;
my $tree = shift;
my $left = shift;
my $right = shift;
my $caremath = shift || 0; # do we care if we're already in math mode?
# this matters for \( \), \[ \]
my $leftpos = -1; # no text pos for found left brace yet.
my $leftidx = -1; # no array index for found left brace yet.
# loop through the nodes
for (my $i = 0; $i < scalar @{$tree->{nodes}}; $i++) {
my $node = $tree->{nodes}[$i];
my $pos = 0; # position placeholder within the text block
my $spos = $node->{start}; # get text start position
if ($node->{type} eq 'TEXT' &&
(!$caremath || (!$node->{math} && $caremath))) {
# search for left brace if we haven't started a pair yet
if ($leftidx == -1) {
$leftpos = _findsymbol($node->{content}, $left, $pos);
if ($leftpos != -1) {
#print "found (left) $left in [$node->{content}]\n";
$leftidx = $i;
$pos = $leftpos + 1; # next pos to search from
}
}
# search for a right brace
if ($leftpos != -1) {
my $rightpos = _findsymbol($node->{content}, $right, $pos);
# found
if ($rightpos != -1) {
# we have to split the text node into 3 parts
if ($leftidx == $i) {
#print "splitwithin: found (right) $right in [$node->{content}]\n";
my ($leftnode, $textnode3) = $node->split($rightpos, $rightpos + length($right) - 1);
my ($textnode1, $textnode2) = $leftnode->split($leftpos, $leftpos + length($left) - 1);
my $startpos = $spos; # get text start position
# make the math ENVIRONMENT node
my $mathnode = LaTeX::TOM::Node->new(
{type => 'ENVIRONMENT',
class => $left, # use left delim as class
math => 1,
start => $startpos + $leftpos,
ostart => $startpos + $leftpos - length($left) + 1,
end => $startpos + $rightpos,
oend => $startpos + $rightpos + length($right) - 1,
children => LaTeX::TOM::Tree->new([$textnode2], $parser),
});
splice @{$tree->{nodes}}, $i, 1, $textnode1, $mathnode, $textnode3;
$i++; # skip ahead two nodes, so we'll be parsing textnode3
}
# split across nodes
else {
#print "splitacross: found (right) $right in [$node->{content}]\n";
# create new set of 4 smaller text nodes from the original two
# that contain the left and right delimeters
#
my ($textnode1, $textnode2) = $tree->{nodes}[$leftidx]->split($leftpos, $leftpos + length($left) - 1);
my ($textnode3, $textnode4) = $tree->{nodes}[$i]->split($rightpos, $rightpos + length($right) - 1);
# nodes to remove "from the middle" (between the left and right
# text nodes which contain the delimeters)
#
my @remnodes = splice @{$tree->{nodes}}, $leftidx+1, $i - $leftidx - 1;
# create a math node that contains the text after the left brace,
# then all the nodes up until the next text node, then the text
# before the right brace.
#
my $mathnode = LaTeX::TOM::Node->new(
{type => 'ENVIRONMENT',
class => $left,
math => 1,
start => $textnode2->{start} - 1,
end => $textnode3->{end} + 1,
ostart => $textnode2->{start} - 1 - length($left) + 1,
oend => $textnode3->{end} + 1 + length($right) - 1,
children => LaTeX::TOM::Tree->new(
[$textnode2,
@remnodes,
$textnode3], $parser),
});
# replace (TEXT_A, ... , TEXT_B) with the mathnode created above
splice @{$tree->{nodes}}, $leftidx, 2, $textnode1, $mathnode, $textnode4;
# do all nodes again but the very leftmost
#
$i = $leftidx;
}
$leftpos = -1; # reset left data
$leftidx = -1;
} # right brace
} # left brace
else {
my $rightpos = _findsymbol($node->{content}, $right, $pos);
if ($rightpos != -1) {
my $startpos = $node->{start}; # get text start position
if ($parser->{PARSE_ERRORS_FATAL} == 1) {
die "parse error: unmatched '$right' at ".($startpos+$rightpos).".";
} elsif ($parser->{PARSE_ERRORS_FATAL} == 0) {
warn "parse error: unmatched '$right' at ".($startpos+$rightpos).".";
}
}
}
} # if TEXT
# recur, but not into verbatim environments!
#
elsif ($node->{children} &&
!(
($node->{type} eq 'COMMAND' && $node->{command} =~ /^verb/) ||
($node->{type} eq 'ENVIRONMENT' && $node->{class} =~ /^verbatim/))) {
#print "Recurring into $node->{type} node ";
#print "$node->{command}" if ($node->{type} eq 'COMMAND');
#print "$node->{class}" if ($node->{type} eq 'ENVIRONMENT');
#print "\n";
$parser->_stage5_r($node->{children}, $left, $right, $caremath);
}
} # loop over text blocks
if ($leftpos != -1) {
my $startpos = $tree->{nodes}[$leftidx]->{start}; # get text start position
if ($parser->{PARSE_ERRORS_FATAL} == 1) {
die "parse error: unmatched '$left' at ".($startpos+$leftpos).".";
} elsif ($parser->{PARSE_ERRORS_FATAL} == 0) {
warn "parse error: unmatched '$left' at ".($startpos+$leftpos).".";
}
}
}
# This stage propegates the math mode flag and plaintext flags downward.
#
# After this is done, we can make the claim that only text nodes marked with
# the plaintext flag should be printed. math nodes will have the "math" flag,
# and also plantext = 0.
#
sub _propegateModes {
my $parser = shift;
my $tree = shift;
my $math = shift; # most likely want to call this with 0
my $plaintext = shift; # ditto this-- default to nothing visible.
foreach my $node (@{$tree->{nodes}}) {
# handle text nodes on this level. set flags.
#
if ($node->{type} eq 'TEXT') {
$node->{math} = $math;
$node->{plaintext} = $plaintext;
}
# propegate flags downward, possibly modified
#
elsif (defined $node->{children}) {
my $mathflag = $math; # math propegates down by default
my $plaintextflag = 0; # plaintext flag does NOT propegate by default
# handle math or plain text forcing envs
#
if ($node->{type} eq 'ENVIRONMENT' || $node->{type} eq 'COMMAND') {
if (defined $node->{class} && (
$parser->{config}{MATHENVS}->{$node->{class}} ||
$parser->{config}{MATHENVS}->{"$node->{class}*"})
)
{
$mathflag = 1;
$plaintextflag = 0;
}
elsif (($node->{type} eq 'COMMAND' &&
($parser->{config}{TEXTENVS}->{$node->{command}} ||
$parser->{config}{TEXTENVS}->{"$node->{command}*"})) ||
($node->{type} eq 'ENVIRONMENT' &&
($parser->{config}{TEXTENVS}->{$node->{class}} ||
$parser->{config}{TEXTENVS}{"$node->{command}*"}))
) {
$mathflag = 0;
$plaintextflag = 1;
}
}
# groupings change nothing
#
elsif ($node->{type} eq 'GROUP') {
$mathflag = $math;
$plaintextflag = $plaintext;
}
# recur
$parser->_propegateModes($node->{children}, $mathflag, $plaintextflag);
}
}
}
# apply a mapping to text nodes in a tree
#
# for newcommands and defs: mapping is a hash:
#
# {name, nparams, template, type}
#
# name is a string
# nparams is an integer
# template is a tree fragement containing text nodes with #x flags, where
# parameters will be replaced.
# type is "command"
#
# for newenvironments:
#
# {name, nparams, btemplate, etemplate, type}
#
# same as above, except type is "environment" and there are two templates,
# btemplate and etemplate.
#
sub _applyMapping {
my $parser = shift;
my $tree = shift;
my $mapping = shift;
my $i = shift || 0; # index to start with, in tree.
my $applications = 0; # keep track of # of applications
for (; $i < @{$tree->{nodes}}; $i++) {
my $node = $tree->{nodes}[$i];
# begin environment nodes
#
if ($node->{type} eq 'COMMAND' &&
$node->{command} eq 'begin' &&
$node->{children}->{nodes}[0]->{content} eq $mapping->{name}) {
# grab the nparams next group nodes as parameters
#
my @params = ();
my $remain = $mapping->{nparams};
my $j = 1;
while ($remain > 0 && ($i + $j) < scalar @{$tree->{nodes}}) {
my $node = $tree->{nodes}[$i + $j];
# grab group node
if ($node->{type} eq 'GROUP') {
push @params, $node->{children};
$remain--;
}
$j++;
}
# if we didn't get enough group nodes, bomb out
next if $remain;
# otherwise make new subtree
my $applied = _applyParamsToTemplate($mapping->{btemplate}, @params);
# splice in the result
splice @{$tree->{nodes}}, $i, $j, @$applied;
# skip past all the new stuff
$i += scalar @$applied - 1;
}
# end environment nodes
#
elsif ($node->{type} eq 'COMMAND' &&
$node->{command} eq 'end' &&
$node->{children}->{nodes}[0]->{content} eq $mapping->{name}) {
# make new subtree (no params)
my $applied = $mapping->{etemplate}->copy();
# splice in the result
splice @{$tree->{nodes}}, $i, 1, @$applied;
# skip past all the new stuff
$i += scalar @$applied - 1;
$applications++; # only count end environment nodes
}
# newcommand nodes
#
elsif ($node->{type} eq 'COMMAND' &&
$node->{command} eq $mapping->{name} &&
$mapping->{nparams}) {
my @params = ();
# children of COMMAND node will be first parameter
push @params, $node->{children};
# find next nparams GROUP nodes and push their children onto @params
my $remain = $mapping->{nparams} - 1;
my $j = 1;
while ($remain > 0 && ($i + $j) < scalar @{$tree->{nodes}}) {
my $node = $tree->{nodes}[$i + $j];
# grab group node
if ($node->{type} eq 'GROUP') {
push @params, $node->{children};
$remain--;
}
$j++;
}
# if we didn't get enough group nodes, bomb out
next if ($remain > 0);
# apply the params to the template
my $applied = _applyParamsToTemplate($mapping->{template}, @params);
# splice in the result
splice @{$tree->{nodes}}, $i, $j, @$applied;
# skip past all the new stuff
$i += scalar @$applied - 1;
$applications++;
}
# find 0-param mappings
elsif ($node->{type} eq 'TEXT' && !$mapping->{nparams}) {
my $text = $node->{content};
my $command = $mapping->{name};
# find occurrences of the mapping command
#
my $wordend = ($command =~ /\w$/ ? 1 : 0);
while (($wordend && $text =~ /\\\Q$command\E(\W|$)/g) ||
(!$wordend && $text =~ /\\\Q$command\E/g)) {
#warn "found occurrence of mapping $command";
my $idx = index $node->{content}, '\\' . $command, 0;
# split the text node at that command
my ($leftnode, $rightnode) = $node->split($idx, $idx + length($command));
# copy the mapping template
my $applied = $mapping->{template}->copy();
# splice the new nodes in
splice @{$tree->{nodes}}, $i, 1, $leftnode, @$applied, $rightnode;
# adjust i so we end up on rightnode when we're done
$i += scalar @$applied + 1;
# get the next node
$node = $tree->{$node}[$i];
# count application
$applications++;
}
}
# recur
elsif ($node->{children}) {
$applications += $parser->_applyMapping($node->{children}, $mapping);
}
}
return $applications;
}
# find and apply all mappings in the tree, progressively and recursively.
# a mapping applies to the entire tree and subtree consisting of nodes AFTER
# itself in the level array.
#
sub _applyMappings {
my $parser = shift;
my $tree = shift;
for (my $i = 0; $i < @{$tree->{nodes}}; $i++) {
my $prev = $tree->{nodes}[$i-1];
my $node = $tree->{nodes}[$i];
# find newcommands
if ($node->{type} eq 'COMMAND' &&
$node->{command} =~ /^(re)?newcommand$/) {
my $mapping = _makeMapping($tree, $i);
next if (!$mapping->{name}); # skip fragged commands
if ($parser->{USED_COMMANDS}->{$mapping->{name}}) {
#print "applying (nc) mapping $mapping->{name}\n";
} else {
#print "NOT applying (nc) mapping $mapping->{name}\n";
next;
}
# add to mappings list
#
$parser->{MAPPEDCMDS}->{"\\$mapping->{name}"} = 1;
#print "found a mapping with name $mapping->{name}, $mapping->{nparams} params\n";
# remove the mapping declaration
#
splice @{$tree->{nodes}}, $i, $mapping->{skip} + 1;
# apply the mapping
my $count = $parser->_applyMapping($tree, $mapping, $i);
#if ($count > 0) {
# print "printing altered subtree\n";
# $tree->print();
#}
$i--; # since we removed the cmd node, check this index again
}
# handle "\newenvironment" mappings
elsif ($node->{type} eq 'COMMAND' &&
$node->{command} =~ /^(re)?newenvironment$/) {
# make a mapping hash
#
my $mapping = $parser->_makeEnvMapping($tree, $i);
next if (!$mapping->{name}); # skip fragged commands.
#print "applying (ne) mapping $mapping->{name}\n";
# remove the mapping declaration
#
splice @{$tree->{nodes}}, $i, $mapping->{skip} + 1;
# apply the mapping
#
my $count = $parser->_applyMapping($tree, $mapping, $i);
}
# handle "\def" stype commands.
elsif ($node->{type} eq 'COMMAND' &&
defined $prev &&
$prev->{type} eq 'TEXT' &&
$prev->{content} =~ /\\def\s*$/o) {
#print "found def style mapping $node->{command}\n";
# remove the \def
$prev->{content} =~ s/\\def\s*$//o;
# make the mapping
my $mapping = {name => $node->{command},
nparams => 0,
template => $node->{children}->copy(),
type => 'command'};
next if (!$mapping->{name}); # skip fragged commands
if ($parser->{USED_COMMANDS}->{$mapping->{name}}) {
#print "applying (def) mapping $mapping->{name}\n";
} else {
#print "NOT applying (def) mapping $mapping->{name}\n";
next;
}
# add to mappings list
#
$parser->{MAPPEDCMDS}->{"\\$mapping->{name}"} = 1;
#print "template is \n";
#$mapping->{template}->print();
# remove the command node
splice @{$tree->{nodes}}, $i, 1;
# apply the mapping
my $count = $parser->_applyMapping($tree, $mapping, $i);
$i--; # check this index again
}
# recur
elsif ($node->{children}) {
$parser->_applyMappings($node->{children});
}
}
}
# read files from \input commands and place into the tree, parsed
#
# also include bibliographies
#
sub _addInputs {
my $parser = shift;
my $tree = shift;
for (my $i = 0; $i < @{$tree->{nodes}}; $i++) {
my $node = $tree->{nodes}[$i];
if ($node->{type} eq 'COMMAND' &&
$node->{command} eq 'input') {
my $file = $node->{children}->{nodes}[0]->{content};
next if ($file =~ /pstex/); # ignore pstex images
#print "reading input file $file\n";
# read in contents of file
my $contents = _readFile($file);
if ($contents eq '' && not $file =~ /\.tex$/) {
$file = "$file.tex";
$contents = _readFile($file);
}
# dump Psfig/TeX files, they aren't useful to us and have
# nonconforming syntax. Use declaration line as our heuristic.
#
if ($contents =~ /^\%\s*Psfig\/TeX\s*$/m) {
$contents = '';
warn "ignoring Psfig input '$file'";
}
# actually do the parse of the sub-content
#
if ($contents) {
# parse into a tree
my ($subtree,) = $parser->_basicparse($contents, $parser->{PARSE_ERRORS_FATAL});
# replace \input command node with subtree
splice @{$tree->{nodes}}, $i, 1, @{$subtree->{nodes}};
# step back
$i--;
}
}
elsif ($node->{type} eq 'COMMAND' &&
$node->{command} eq 'bibliography') {
# try to find a .bbl file
#
my @FILES = <*>;
foreach my $file (@FILES) {
if ($file =~ /\.bbl$/) {
my $contents = _readFile($file);
if ($contents) {
my ($subtree,) = $parser->_basicparse($contents, $parser->{PARSE_ERRORS_FATAL});
splice @{$tree->{nodes}}, $i, 1, @$subtree;
$i--;
}
}
}
}
# recur
if ($node->{children}) {
$parser->_addInputs($node->{children});
}
}
}
# do pre-mapping parsing
#
sub _parseA {
my $parser = shift;
my $text = shift;
my $tree = $parser->_stage1($text);
my $bracehash = $parser->_stage2($tree);
$parser->_stage3($tree);
$parser->_addInputs($tree) if $parser->{readinputs};
return ($tree, $bracehash);
}
# do post-mapping parsing (make environments)
#
sub _parseB {
my $parser = shift;
my $tree = shift;
$parser->_stage4($tree);
#print "done with parseStage4\n";
$parser->_stage5($tree, 0);
#print "done with parseStage5\n";
}
###############################################################################
#
# Parser "Static" Subroutines
#
###############################################################################
# find next unescaped char in some text
#
sub _uindex {
my $text = shift;
my $char = shift;
my $pos = shift;
my $realbrace = 0;
my $idx = -1;
# get next opening brace
do {
$realbrace = 1;
$idx = index $text, $char, $pos;
if ($idx != -1) {
$pos = $idx + 1;
my $prevchar = substr $text, $idx - 1, 1;
if ($prevchar eq '\\') {
$realbrace = 0;
$idx = -1;
}
}
} while (!$realbrace);
return $idx;
}
# support function: find the next occurrence of some symbol which is
# not escaped.
#
sub _findsymbol {
my $text = shift;
my $symbol = shift;
my $pos = shift;
my $realhit = 0;
my $index = -1;
# get next occurrence of the symbol
do {
$realhit = 1;
$index = index $text, $symbol, $pos;
if ($index != -1) {
$pos = $index + 1;
# make sure this occurrence isn't escaped. this is imperfect.
#
my $prevchar = ($index - 1 >= 0) ?
(substr $text, $index - 1, 1) : '';
my $pprevchar = ($index - 2 >= 0) ?
(substr $text, $index - 2, 1) : '';
if ($prevchar eq '\\' && $pprevchar ne '\\') {
$realhit = 0;
$index = -1;
}
}
} while (!$realhit);
return $index;
}
# support function: find the earliest next brace in some (flat) text
#
sub _findbrace {
my $text = shift;
my $pos = shift;
my $realbrace = 0;
my $index_o = -1;
my $index_c = -1;
my $pos_o = $pos;
my $pos_c = $pos;
# get next opening brace
do {
$realbrace = 1;
$index_o = index $text, '{', $pos_o;
if ($index_o != -1) {
$pos_o = $index_o + 1;
# make sure this brace isn't escaped. this is imperfect.
#
my $prevchar = ($index_o - 1 >= 0) ?
(substr $text, $index_o - 1, 1) : '';
my $pprevchar = ($index_o - 2 >= 0) ?
(substr $text, $index_o - 2, 1) : '';
if ($prevchar eq '\\' && $pprevchar ne '\\') {
$realbrace = 0;
$index_o = -1;
}
}
} while (!$realbrace);
# get next closing brace
do {
$realbrace = 1;
$index_c = index $text, '}', $pos_c;
if (($index_c - 1) >= 0 && substr($text, $index_c - 1, 1) eq ' ') {
$pos_c = $index_c + 1;
$index_c = -1;
}
if ($index_c != -1) {
$pos_c = $index_c + 1;
# make sure this brace isn't escaped. this is imperfect.
#
my $prevchar = ($index_c - 1 >= 0) ?
(substr $text, $index_c - 1, 1) : '';
my $pprevchar = ($index_c - 2 >= 0) ?
(substr $text, $index_c - 2, 1) : '';
if ($prevchar eq '\\' && $pprevchar ne '\\') {
$realbrace = 0;
$index_c = -1;
}
}
} while (!$realbrace);
# handle all find cases
return (-1, '') if ($index_o == -1 && $index_c == -1);
return ($index_o, '{') if ($index_c == -1 ||
($index_o != -1 && $index_o < $index_c));
return ($index_c, '}') if ($index_o == -1 || $index_c < $index_o);
}
# skip "blank nodes" in a tree, starting at some position. will finish
# at the first non-blank node. (ie, not a comment or whitespace TEXT node.
#
sub _skipBlankNodes {
my $tree = shift;
my $i = shift;
while ($tree->{nodes}[$i]->{type} eq 'COMMENT' ||
($tree->{nodes}[$i]->{type} eq 'TEXT' &&
$tree->{nodes}[$i]->{content} =~ /^\s*$/s)) {
$i++;
}
return $i;
}
# is the passed-in node a valid parameter node? for this to be true, it must
# either be a GROUP or a position = inner command.
#
sub _validParamNode {
my $node = shift;
return 1 if ($node->{type} eq 'GROUP' ||
($node->{type} eq 'COMMAND' && $node->{position} eq 'inner'));
return 0;
}
# duplicate a valid param node. This means for a group, copy the child tree.
# for a command, make a new tree with just the command node and its child tree.
#
sub _duplicateParam {
my $parser = shift;
my $node = shift;
if ($node->{type} eq 'GROUP') {
return $node->{children}->copy();
}
elsif ($node->{type} eq 'COMMAND') {
my $subtree = $node->{children}->copy(); # copy child subtree
my $nodecopy = $node->copy(); # make a new node with old data
$nodecopy->{children} = $subtree; # set the child pointer to new subtree
# return a new tree with the new node (subtree) as its only element
return LaTeX::TOM::Tree->new([$nodecopy], $parser);
}
return undef;
}
# make a mapping from a newenvironment fragment
#
# newenvironments have the following syntax:
#
# \newenvironment{name}[nparams]?{beginTeX}{endTeX}
#
sub _makeEnvMapping {
my $parser = shift;
my $tree = shift;
my $i = shift;
return undef if ($tree->{nodes}[$i]->{type} ne 'COMMAND' ||
($tree->{nodes}[$i]->{command} ne 'newenvironment' &&
$tree->{nodes}[$i]->{command} ne 'renewenvironment'));
# figure out command (first child, text node)
my $command = $tree->{nodes}[$i]->{children}->{nodes}[0]->{content};
if ($command =~ /^\s*\\(\S+)\s*$/) {
$command = $1;
}
my $next = $i+1;
# figure out number of params
my $nparams = 0;
if ($tree->{nodes}[$next]->{type} eq 'TEXT') {
my $text = $tree->{nodes}[$next]->{content};
if ($text =~ /^\s*\[\s*([0-9])+\s*\]\s*$/) {
$nparams = $1;
}
$next++;
}
# default templates-- just repeat the declarations
#
my ($btemplate) = $parser->_basicparse("\\begin{$command}", 2, 0);
my ($etemplate) = $parser->_basicparse("\\end{$command}", 2, 0);
my $endpos = $next;
# get two group subtrees... one for the begin and one for the end
# templates. we only ignore whitespace TEXT nodes and comments
#
$next = _skipBlankNodes($tree, $next);
if (_validParamNode($tree->{nodes}[$next])) {
$btemplate = $parser->_duplicateParam($tree->{nodes}[$next]);
$next++;
$next = _skipBlankNodes($tree, $next);
if (_validParamNode($tree->{nodes}[$next])) {
$etemplate = $parser->_duplicateParam($tree->{nodes}[$next]);
$endpos = $next;
}
}
# build and return the mapping hash
#
return {name => $command,
nparams => $nparams,
btemplate => $btemplate, # begin template
etemplate => $etemplate, # end template
skip => $endpos - $i,
type => 'environment'};
}
# make a mapping from a newcommand fragment
# takes tree pointer and index of command node
#
# newcommands have the following syntax:
#
# \newcommand{\name}[nparams]?{anyTeX}
#
sub _makeMapping {
my $tree = shift;
my $i = shift;
return undef if ($tree->{nodes}[$i]->{type} ne 'COMMAND' ||
($tree->{nodes}[$i]->{command} ne 'newcommand' &&
$tree->{nodes}[$i]->{command} ne 'renewcommand'));
# figure out command (first child, text node)
my $command = $tree->{nodes}[$i]->{children}->{nodes}[0]->{content};
if ($command =~ /^\s*\\(\S+)\s*$/) {
$command = $1;
}
my $next = $i+1;
# figure out number of params
my $nparams = 0;
if ($tree->{nodes}[$next]->{type} eq 'TEXT') {
my $text = $tree->{nodes}[$next]->{content};
if ($text =~ /^\s*\[\s*([0-9])+\s*\]\s*$/) {
$nparams = $1;
}
$next++;
}
# grab subtree template (array ref)
#
my $template;
if ($tree->{nodes}[$next]->{type} eq 'GROUP') {
$template = $tree->{nodes}[$next]->{children}->copy();
} else {
return undef;
}
# build and return the mapping hash
#
return {name => $command,
nparams => $nparams,
template => $template,
skip => $next - $i,
type => 'command'};
}
# this sub is the main entry point for the sub that actually takes a set of
# parameter trees and inserts them into a template tree. the return result,
# newly allocated, should be plopped back into the original tree where the
# parameters (along with the initial command invocation)
#
sub _applyParamsToTemplate {
my $template = shift;
my @params = @_;
# have to copy the template to a freshly allocated tree
#
my $applied = $template->copy();
# now recursively apply the params.
#
_applyParamsToTemplate_r($applied, @params);
return $applied;
}
# recursive helper for above
#
sub _applyParamsToTemplate_r {
my $template = shift;
my @params = @_;
for (my $i = 0; $i < @$template; $i++) {
my $node = $template->[$i];
if ($node->{type} eq 'TEXT') {
my $text = $node->{content};
# find occurrences of the parameter flags
#
if ($text =~ /(#([0-9]+))/) {
my $all = $1;
my $num = $2;
# get the index of the flag we just found
#
my $idx = index $text, $all, 0;
# split the node on the location of the flag
#
my ($leftnode, $rightnode) = $node->split($idx, $idx + length($all) - 1);
# make a copy of the param we want
#
my $param = $params[$num - 1]->copy();
# splice the new text nodes, along with the parameter subtree, into
# the old location
#
splice @$template, $i, 1, $leftnode, @$param, $rightnode;
# skip forward to where $rightnode is in $template on next iteration
#
$i += scalar @$param;
}
}
# recur
elsif (defined $node->{children}) {
_applyParamsToTemplate_r($node->{children}, @params);
}
}
}
# This sub takes a chunk of the document text between two points and makes
# it into a list of TEXT nodes and COMMENT nodes, as we would expect from
# '%' prefixed LaTeX comment lines
#
sub _getTextAndCommentNodes {
my $text = shift;
my $begins = shift;
my $ends = shift;
my $nodetext = substr $text, $begins, $ends - $begins;
#warn "getTextAndCommentNodes: looking at [$nodetext]";
my @lines = split (/((?: *(?<!\\)\%[^\n]*\n)+)/m, $nodetext);
my @nodes = ();
my $startpos = 0;
my @out = ();
my $comment = 0; # comment mode
my $first = 1;
foreach my $line (@lines) {
my $mode;
# set current mode based on whether this is a comment
if ($line =~ /^\s*\%/ && $nodetext !~ /\\begin\{verbatim\}.*\Q$line\E.*\\end\{verbatim\}/s) {
$mode = 1; # comment
} else {
$mode = 0; # not-comment (regular text)
}
# if mode stays the same, add to output and do nothing
if ($mode == $comment || $first) {
push @out, $line;
# handle turning off initialization stuff
if ($first) {
$first = 0;
$comment = $mode;
}
}
# if mode changes, make new node from current chunk, change mode
# flag, and start a new chunk
else {
my $output = join ('', @out);
my $newnode = LaTeX::TOM::Node->new(
{type => $comment ? 'COMMENT' : 'TEXT',
start => $begins + $startpos,
end => $begins + $startpos + length($output) - 1,
content => $output});
push @nodes, $newnode;
$startpos += length($output); # update start position
@out = ($line);
$comment = $mode;
}
}
# make node from last chunk
my $output = join('', @out);
my $newnode = LaTeX::TOM::Node->new(
{type => $comment ? 'COMMENT' : 'TEXT',
start => $begins + $startpos,
end => $begins + $startpos + length($output) - 1,
content => $output});
push @nodes, $newnode;
return @nodes;
}
# Read in the contents of a text file on disk. Return in string scalar.
#
sub _readFile {
my $file = shift || '/dev/stdin';
my $contents = "";
open INFILE, $file or return "";
while (my $line = <INFILE>) {
$contents .= $line;
}
close INFILE;
return $contents;
}
1;